{
 "cells": [
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "### Predicting Youtube Spam or Ham with Machine Learning in Python\n",
    "##### Feature Extraction and Text Classification In Python\n",
    "+ Sklearn\n",
    "+ Pandas\n",
    "\n",
    "##### Idea: Converting Words into Vectors to Use as Features to help in classification"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 1,
   "metadata": {},
   "outputs": [],
   "source": [
    "# EDA Packages\n",
    "import pandas as pd\n",
    "import numpy as np\n"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 2,
   "metadata": {},
   "outputs": [],
   "source": [
    "# ML Packages For Vectorization of Text For Feature Extraction\n",
    "from sklearn.feature_extraction.text import CountVectorizer\n",
    "from sklearn.feature_extraction.text import TfidfVectorizer"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 4,
   "metadata": {},
   "outputs": [],
   "source": [
    "# Visualization Packages\n",
    "import matplotlib.pyplot as plt\n",
    "import seaborn as sns"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 5,
   "metadata": {},
   "outputs": [],
   "source": [
    "# Dataset from https://archive.ics.uci.edu/ml/datasets/YouTube+Spam+Collection#\n",
    "df1 = pd.read_csv(\"Youtube01-Psy.csv\")"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 6,
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/html": [
       "<div>\n",
       "<style scoped>\n",
       "    .dataframe tbody tr th:only-of-type {\n",
       "        vertical-align: middle;\n",
       "    }\n",
       "\n",
       "    .dataframe tbody tr th {\n",
       "        vertical-align: top;\n",
       "    }\n",
       "\n",
       "    .dataframe thead th {\n",
       "        text-align: right;\n",
       "    }\n",
       "</style>\n",
       "<table border=\"1\" class=\"dataframe\">\n",
       "  <thead>\n",
       "    <tr style=\"text-align: right;\">\n",
       "      <th></th>\n",
       "      <th>COMMENT_ID</th>\n",
       "      <th>AUTHOR</th>\n",
       "      <th>DATE</th>\n",
       "      <th>CONTENT</th>\n",
       "      <th>CLASS</th>\n",
       "    </tr>\n",
       "  </thead>\n",
       "  <tbody>\n",
       "    <tr>\n",
       "      <th>0</th>\n",
       "      <td>LZQPQhLyRh80UYxNuaDWhIGQYNQ96IuCg-AYWqNPjpU</td>\n",
       "      <td>Julius NM</td>\n",
       "      <td>2013-11-07T06:20:48</td>\n",
       "      <td>Huh, anyway check out this you[tube] channel: ...</td>\n",
       "      <td>1</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>1</th>\n",
       "      <td>LZQPQhLyRh_C2cTtd9MvFRJedxydaVW-2sNg5Diuo4A</td>\n",
       "      <td>adam riyati</td>\n",
       "      <td>2013-11-07T12:37:15</td>\n",
       "      <td>Hey guys check out my new channel and our firs...</td>\n",
       "      <td>1</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>2</th>\n",
       "      <td>LZQPQhLyRh9MSZYnf8djyk0gEF9BHDPYrrK-qCczIY8</td>\n",
       "      <td>Evgeny Murashkin</td>\n",
       "      <td>2013-11-08T17:34:21</td>\n",
       "      <td>just for test I have to say murdev.com</td>\n",
       "      <td>1</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>3</th>\n",
       "      <td>z13jhp0bxqncu512g22wvzkasxmvvzjaz04</td>\n",
       "      <td>ElNino Melendez</td>\n",
       "      <td>2013-11-09T08:28:43</td>\n",
       "      <td>me shaking my sexy ass on my channel enjoy ^_^ ﻿</td>\n",
       "      <td>1</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>4</th>\n",
       "      <td>z13fwbwp1oujthgqj04chlngpvzmtt3r3dw</td>\n",
       "      <td>GsMega</td>\n",
       "      <td>2013-11-10T16:05:38</td>\n",
       "      <td>watch?v=vtaRGgvGtWQ   Check this out .﻿</td>\n",
       "      <td>1</td>\n",
       "    </tr>\n",
       "  </tbody>\n",
       "</table>\n",
       "</div>"
      ],
      "text/plain": [
       "                                    COMMENT_ID            AUTHOR  \\\n",
       "0  LZQPQhLyRh80UYxNuaDWhIGQYNQ96IuCg-AYWqNPjpU         Julius NM   \n",
       "1  LZQPQhLyRh_C2cTtd9MvFRJedxydaVW-2sNg5Diuo4A       adam riyati   \n",
       "2  LZQPQhLyRh9MSZYnf8djyk0gEF9BHDPYrrK-qCczIY8  Evgeny Murashkin   \n",
       "3          z13jhp0bxqncu512g22wvzkasxmvvzjaz04   ElNino Melendez   \n",
       "4          z13fwbwp1oujthgqj04chlngpvzmtt3r3dw            GsMega   \n",
       "\n",
       "                  DATE                                            CONTENT  \\\n",
       "0  2013-11-07T06:20:48  Huh, anyway check out this you[tube] channel: ...   \n",
       "1  2013-11-07T12:37:15  Hey guys check out my new channel and our firs...   \n",
       "2  2013-11-08T17:34:21             just for test I have to say murdev.com   \n",
       "3  2013-11-09T08:28:43   me shaking my sexy ass on my channel enjoy ^_^ ﻿   \n",
       "4  2013-11-10T16:05:38            watch?v=vtaRGgvGtWQ   Check this out .﻿   \n",
       "\n",
       "   CLASS  \n",
       "0      1  \n",
       "1      1  \n",
       "2      1  \n",
       "3      1  \n",
       "4      1  "
      ]
     },
     "execution_count": 6,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "df1.head()"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 7,
   "metadata": {},
   "outputs": [],
   "source": [
    "# Load all our dataset to merge them\n",
    "df2 = pd.read_csv(\"Youtube02-KatyPerry.csv\")\n",
    "df3 = pd.read_csv(\"Youtube03-LMFAO.csv\")\n",
    "df4 = pd.read_csv(\"Youtube04-Eminem.csv\")\n",
    "df5 = pd.read_csv(\"Youtube05-Shakira.csv\")"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 8,
   "metadata": {},
   "outputs": [],
   "source": [
    "frames = [df1,df2,df3,df4,df5]"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 9,
   "metadata": {},
   "outputs": [],
   "source": [
    "# Merging or Concatenating our DF\n",
    "df_merged = pd.concat(frames)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 10,
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/html": [
       "<div>\n",
       "<style scoped>\n",
       "    .dataframe tbody tr th:only-of-type {\n",
       "        vertical-align: middle;\n",
       "    }\n",
       "\n",
       "    .dataframe tbody tr th {\n",
       "        vertical-align: top;\n",
       "    }\n",
       "\n",
       "    .dataframe thead th {\n",
       "        text-align: right;\n",
       "    }\n",
       "</style>\n",
       "<table border=\"1\" class=\"dataframe\">\n",
       "  <thead>\n",
       "    <tr style=\"text-align: right;\">\n",
       "      <th></th>\n",
       "      <th>COMMENT_ID</th>\n",
       "      <th>AUTHOR</th>\n",
       "      <th>DATE</th>\n",
       "      <th>CONTENT</th>\n",
       "      <th>CLASS</th>\n",
       "    </tr>\n",
       "  </thead>\n",
       "  <tbody>\n",
       "    <tr>\n",
       "      <th>0</th>\n",
       "      <td>LZQPQhLyRh80UYxNuaDWhIGQYNQ96IuCg-AYWqNPjpU</td>\n",
       "      <td>Julius NM</td>\n",
       "      <td>2013-11-07T06:20:48</td>\n",
       "      <td>Huh, anyway check out this you[tube] channel: ...</td>\n",
       "      <td>1</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>1</th>\n",
       "      <td>LZQPQhLyRh_C2cTtd9MvFRJedxydaVW-2sNg5Diuo4A</td>\n",
       "      <td>adam riyati</td>\n",
       "      <td>2013-11-07T12:37:15</td>\n",
       "      <td>Hey guys check out my new channel and our firs...</td>\n",
       "      <td>1</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>2</th>\n",
       "      <td>LZQPQhLyRh9MSZYnf8djyk0gEF9BHDPYrrK-qCczIY8</td>\n",
       "      <td>Evgeny Murashkin</td>\n",
       "      <td>2013-11-08T17:34:21</td>\n",
       "      <td>just for test I have to say murdev.com</td>\n",
       "      <td>1</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>3</th>\n",
       "      <td>z13jhp0bxqncu512g22wvzkasxmvvzjaz04</td>\n",
       "      <td>ElNino Melendez</td>\n",
       "      <td>2013-11-09T08:28:43</td>\n",
       "      <td>me shaking my sexy ass on my channel enjoy ^_^ ﻿</td>\n",
       "      <td>1</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>4</th>\n",
       "      <td>z13fwbwp1oujthgqj04chlngpvzmtt3r3dw</td>\n",
       "      <td>GsMega</td>\n",
       "      <td>2013-11-10T16:05:38</td>\n",
       "      <td>watch?v=vtaRGgvGtWQ   Check this out .﻿</td>\n",
       "      <td>1</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>5</th>\n",
       "      <td>LZQPQhLyRh9-wNRtlZDM90f1k0BrdVdJyN_YsaSwfxc</td>\n",
       "      <td>Jason Haddad</td>\n",
       "      <td>2013-11-26T02:55:11</td>\n",
       "      <td>Hey, check out my new website!! This site is a...</td>\n",
       "      <td>1</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>6</th>\n",
       "      <td>z13lfzdo5vmdi1cm123te5uz2mqig1brz04</td>\n",
       "      <td>ferleck ferles</td>\n",
       "      <td>2013-11-27T21:39:24</td>\n",
       "      <td>Subscribe to my channel ﻿</td>\n",
       "      <td>1</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>7</th>\n",
       "      <td>z122wfnzgt30fhubn04cdn3xfx2mxzngsl40k</td>\n",
       "      <td>Bob Kanowski</td>\n",
       "      <td>2013-11-28T12:33:27</td>\n",
       "      <td>i turned it on mute as soon is i came on i jus...</td>\n",
       "      <td>0</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>8</th>\n",
       "      <td>z13ttt1jcraqexk2o234ghbgzxymz1zzi04</td>\n",
       "      <td>Cony</td>\n",
       "      <td>2013-11-28T16:01:47</td>\n",
       "      <td>You should check my channel for Funny VIDEOS!!﻿</td>\n",
       "      <td>1</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>9</th>\n",
       "      <td>z12avveb4xqiirsix04chxviiljryduwxg0</td>\n",
       "      <td>BeBe Burkey</td>\n",
       "      <td>2013-11-28T16:30:13</td>\n",
       "      <td>and u should.d check my channel and tell me wh...</td>\n",
       "      <td>1</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>10</th>\n",
       "      <td>z13auhww3oufjn1qo04ci3grqqjmfjexxuo0k</td>\n",
       "      <td>Huckyduck</td>\n",
       "      <td>2013-11-28T17:06:17</td>\n",
       "      <td>Hey subscribe to me﻿</td>\n",
       "      <td>1</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>11</th>\n",
       "      <td>z13xit5agm2zyh4f523rst2gowmbx5bml</td>\n",
       "      <td>Lone Twistt</td>\n",
       "      <td>2013-11-28T17:34:55</td>\n",
       "      <td>Once you have started reading do not stop. If...</td>\n",
       "      <td>1</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>12</th>\n",
       "      <td>z13pejoiuozwxtdu323dspopnri4xts0f</td>\n",
       "      <td>Archie Lewis</td>\n",
       "      <td>2013-11-28T17:54:39</td>\n",
       "      <td>https://twitter.com/GBphotographyGB﻿</td>\n",
       "      <td>1</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>13</th>\n",
       "      <td>z121zxaxsq25z5k5o04ch1o5jqqfij3gtm40k</td>\n",
       "      <td>TheUploadaddict</td>\n",
       "      <td>2013-11-28T18:12:12</td>\n",
       "      <td>subscribe like comment﻿</td>\n",
       "      <td>1</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>14</th>\n",
       "      <td>z12oglnpoq3gjh4om04cfdlbgp2uepyytpw0k</td>\n",
       "      <td>Francisco Nora</td>\n",
       "      <td>2013-11-28T19:52:35</td>\n",
       "      <td>please like :D https://premium.easypromosapp.c...</td>\n",
       "      <td>1</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>15</th>\n",
       "      <td>z13phrmwrkfisn5er22eyrbpbvaiwfvwf04</td>\n",
       "      <td>Gaming and Stuff PRO</td>\n",
       "      <td>2013-11-28T21:14:13</td>\n",
       "      <td>Hello! Do you like gaming, art videos, scienti...</td>\n",
       "      <td>1</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>16</th>\n",
       "      <td>z13bgdvyluihfv11i22rgxwhuvabzz1os04</td>\n",
       "      <td>Zielimeek21</td>\n",
       "      <td>2013-11-28T21:49:00</td>\n",
       "      <td>I'm only checking the views﻿</td>\n",
       "      <td>0</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>17</th>\n",
       "      <td>z13vxpnoxsyeuv2jr04cctprprb1slnxdf4</td>\n",
       "      <td>OutrightIgnite</td>\n",
       "      <td>2013-11-28T21:55:02</td>\n",
       "      <td>http://www.ebay.com/itm/171183229277?ssPageNam...</td>\n",
       "      <td>1</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>18</th>\n",
       "      <td>z12qth5j0ob1fx3q404chvy4fz32tbkpllk0k</td>\n",
       "      <td>Tony K Frazier</td>\n",
       "      <td>2013-11-28T23:57:13</td>\n",
       "      <td>http://ubuntuone.com/40beUutVu2ZKxK4uTgPZ8K﻿</td>\n",
       "      <td>1</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>19</th>\n",
       "      <td>z13etj0bclzfztuwc04cgfvrgmf3fvjor1g</td>\n",
       "      <td>Jose Renteria</td>\n",
       "      <td>2013-11-29T00:22:01</td>\n",
       "      <td>We are an EDM apparel company dedicated to bri...</td>\n",
       "      <td>1</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>20</th>\n",
       "      <td>z12axnji5w2axxht522thb3bktvqjdlbp04</td>\n",
       "      <td>zhichao wang</td>\n",
       "      <td>2013-11-29T02:13:56</td>\n",
       "      <td>i think about 100 millions of the views come f...</td>\n",
       "      <td>0</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>21</th>\n",
       "      <td>z13ozdmr4lf3uzc5z04cix2zkyjzgvcyemw0k</td>\n",
       "      <td>Carlos Thegamer</td>\n",
       "      <td>2013-12-01T01:20:21</td>\n",
       "      <td>subscribe to my channel people :D﻿</td>\n",
       "      <td>1</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>22</th>\n",
       "      <td>z12ohdxjtsatvppjb04cctprprb1slnxdf4</td>\n",
       "      <td>OutrightIgnite</td>\n",
       "      <td>2013-12-01T03:30:55</td>\n",
       "      <td>Show your AUBURN PRIDE HERE: http://www.teespr...</td>\n",
       "      <td>1</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>23</th>\n",
       "      <td>z12ntlcqht2bvjewi04cf1up0xjvs5lq3mc0k</td>\n",
       "      <td>Owen Lai</td>\n",
       "      <td>2013-12-01T04:51:52</td>\n",
       "      <td>just checking the views﻿</td>\n",
       "      <td>0</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>24</th>\n",
       "      <td>LZQPQhLyRh9EXArr4ZnVcDonSbvSMHKYOT24e_qR6fE</td>\n",
       "      <td>||GuitarZ||</td>\n",
       "      <td>2013-12-23T12:54:38</td>\n",
       "      <td>CHECK OUT MY CHANNEL</td>\n",
       "      <td>1</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>25</th>\n",
       "      <td>LZQPQhLyRh9y57URF7qpZRk3MVAJNLNhhZga_5YWBU8</td>\n",
       "      <td>Living4Techno</td>\n",
       "      <td>2013-12-25T19:46:26</td>\n",
       "      <td>marketglory . com/strategygame/andrijamatf ear...</td>\n",
       "      <td>1</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>26</th>\n",
       "      <td>LZQPQhLyRh9vw01Xvvw5yWzZEUOPG1hSgRMHep55-Yw</td>\n",
       "      <td>8-BitMusic</td>\n",
       "      <td>2013-12-27T23:07:50</td>\n",
       "      <td>Hey guys! Im a 12 yr old music producer. I mak...</td>\n",
       "      <td>1</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>27</th>\n",
       "      <td>z13kszcinpnvc34v2234fnpxkpmlw3nhc04</td>\n",
       "      <td>Kyle Jaber</td>\n",
       "      <td>2014-01-19T00:21:29</td>\n",
       "      <td>Check me out! I'm kyle. I rap so yeah ﻿</td>\n",
       "      <td>1</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>28</th>\n",
       "      <td>z13tj514otzlurfbc04ccjwhrnmej1iihqw0k</td>\n",
       "      <td>Brandon Pryor</td>\n",
       "      <td>2014-01-19T00:36:25</td>\n",
       "      <td>I dont even watch it anymore i just come here ...</td>\n",
       "      <td>0</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>29</th>\n",
       "      <td>z13zvh1rmk3cf3mby04civbq5mjtddmbysk0k</td>\n",
       "      <td>Fun&amp;amp;Hacks</td>\n",
       "      <td>2014-01-19T00:42:35</td>\n",
       "      <td>Subscribe to me for free Android games, apps.. ﻿</td>\n",
       "      <td>1</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>...</th>\n",
       "      <td>...</td>\n",
       "      <td>...</td>\n",
       "      <td>...</td>\n",
       "      <td>...</td>\n",
       "      <td>...</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>340</th>\n",
       "      <td>_2viQ_Qnc6-1oZCLsUWjl3-g4QrWxMQXsSTs2Hy4MGI</td>\n",
       "      <td>damion taylor</td>\n",
       "      <td>2013-07-21T20:20:34.118000</td>\n",
       "      <td>check out my new video</td>\n",
       "      <td>1</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>341</th>\n",
       "      <td>_2viQ_Qnc6_YN7xFNAg14zX99Y614Salf57yOcrBRSw</td>\n",
       "      <td>Shadrach Grentz</td>\n",
       "      <td>2013-07-21T12:21:37.898000</td>\n",
       "      <td>Hey Music Fans I really appreciate all of you ...</td>\n",
       "      <td>1</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>342</th>\n",
       "      <td>_2viQ_Qnc6_fyjM2m-ismUToowpNFauwtldKlfjbtIk</td>\n",
       "      <td>Joshua Kasey</td>\n",
       "      <td>2013-07-21T08:26:18.155000</td>\n",
       "      <td>Hello everyone, It Is not my intention to spam...</td>\n",
       "      <td>1</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>343</th>\n",
       "      <td>_2viQ_Qnc6_AXZ4E5CeA3LHE36RAijd3QKgUI-YvjWI</td>\n",
       "      <td>ricky swaggz</td>\n",
       "      <td>2013-07-20T22:09:23.728000</td>\n",
       "      <td>******* Facebook is LAME and so 2004! Check ou...</td>\n",
       "      <td>1</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>344</th>\n",
       "      <td>_2viQ_Qnc68ki9xsFeN2y1_ZiHYcZC8Qv1GyHfwqr7Y</td>\n",
       "      <td>steven reed</td>\n",
       "      <td>2013-07-19T22:12:16.609000</td>\n",
       "      <td>Please check out and send to others Freedom an...</td>\n",
       "      <td>1</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>345</th>\n",
       "      <td>_2viQ_Qnc69mci30y5muwQXNMaeCmIvZ4ca8l_4zPmA</td>\n",
       "      <td>Johnny Rei Vlog</td>\n",
       "      <td>2013-07-19T11:41:54.923000</td>\n",
       "      <td>Nice to meet You - this is Johnny: 1. If You a...</td>\n",
       "      <td>1</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>346</th>\n",
       "      <td>_2viQ_Qnc68cxHfQzecR1L9f-hbLNIJ7VxpCZtctPMk</td>\n",
       "      <td>yakikukamo FIRELOVER</td>\n",
       "      <td>2013-07-18T17:07:06.152000</td>\n",
       "      <td>hey you ! check out the channel of Alvar Lake !!</td>\n",
       "      <td>1</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>347</th>\n",
       "      <td>_2viQ_Qnc6-G71VMp3dR76dfQTcrRHpiNXJh2jm8V_M</td>\n",
       "      <td>Johnny Rei Vlog</td>\n",
       "      <td>2013-07-18T16:50:26.909000</td>\n",
       "      <td>Hi -this is Johnny: 1. If You already know my ...</td>\n",
       "      <td>1</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>348</th>\n",
       "      <td>_2viQ_Qnc6-3Nk200KmVtS-kiCS_1CjKJsMIbXakyfI</td>\n",
       "      <td>hsn moghrbi</td>\n",
       "      <td>2013-07-17T21:14:40.168000</td>\n",
       "      <td>wow</td>\n",
       "      <td>0</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>349</th>\n",
       "      <td>_2viQ_Qnc6-UzHByAP8y3BxG633jelEC_fxtFRUvLSA</td>\n",
       "      <td>Zuzanna Sztandera</td>\n",
       "      <td>2013-07-17T20:41:00.612000</td>\n",
       "      <td>Love this song!!!</td>\n",
       "      <td>0</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>350</th>\n",
       "      <td>_2viQ_Qnc68LpP5gDCaWQuiywObesTUlRgSQExMVMac</td>\n",
       "      <td>Chelsea Fischer</td>\n",
       "      <td>2013-07-17T20:34:59.389000</td>\n",
       "      <td>Love this song !!!!!!</td>\n",
       "      <td>0</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>351</th>\n",
       "      <td>z12xc3ly4x3uttmci22xff24nqqxwb0je04</td>\n",
       "      <td>Lisa Matthews</td>\n",
       "      <td>2013-07-17T13:56:03.233000</td>\n",
       "      <td>Check out this video on YouTube:&lt;br /&gt;&amp;quot;Th...</td>\n",
       "      <td>1</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>352</th>\n",
       "      <td>_2viQ_Qnc6-PfOjDtTwbTalW_5TRtvBKMcHZdDrcI2o</td>\n",
       "      <td>laura elliott</td>\n",
       "      <td>2013-07-16T05:48:01.795000</td>\n",
       "      <td>i watched this because of the large amount of ...</td>\n",
       "      <td>0</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>353</th>\n",
       "      <td>_2viQ_Qnc69GH3FQl348HonbRxpbmtsR5CUei0zkJog</td>\n",
       "      <td>Riley Rollins</td>\n",
       "      <td>2013-07-16T00:30:46.660000</td>\n",
       "      <td>O peoples of the earth, I have seen how you pe...</td>\n",
       "      <td>1</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>354</th>\n",
       "      <td>_2viQ_Qnc69S12dQyWLf0QBgUD29OMTe71geFOn4PJA</td>\n",
       "      <td>Oona Sarlotta</td>\n",
       "      <td>2013-07-15T16:08:50.204000</td>\n",
       "      <td>this song always gives me chills! :)</td>\n",
       "      <td>0</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>355</th>\n",
       "      <td>_2viQ_Qnc69Ic8yEMHsemUQiq01-kwwqnbDowMO9kdM</td>\n",
       "      <td>lynchee360</td>\n",
       "      <td>2013-07-14T22:38:26.779000</td>\n",
       "      <td>I love dis song!! 3</td>\n",
       "      <td>0</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>356</th>\n",
       "      <td>_2viQ_Qnc6-qHJ_u9Yv84vj4yOAPLUL3ZibCc7b-vBI</td>\n",
       "      <td>FAHAD KHAN</td>\n",
       "      <td>2013-07-14T22:06:57.712000</td>\n",
       "      <td>I WILL NEVER FORGET THIS SONG IN MY LIFE LIKE ...</td>\n",
       "      <td>1</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>357</th>\n",
       "      <td>_2viQ_Qnc6_HU65mTzCmXnjA-WLt7XqxqPj7EwAtlO0</td>\n",
       "      <td>ricky swaggz</td>\n",
       "      <td>2013-07-14T20:40:00.331000</td>\n",
       "      <td>********OMG Facebook is OLD! Check out  ------...</td>\n",
       "      <td>1</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>358</th>\n",
       "      <td>_2viQ_Qnc6-jk58CPwBnqfbM6oByJH5oPvCtKecLQyo</td>\n",
       "      <td>Shadrach Grentz</td>\n",
       "      <td>2013-07-14T03:11:20.243000</td>\n",
       "      <td>Hey Music Fans I really appreciate all of you ...</td>\n",
       "      <td>1</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>359</th>\n",
       "      <td>_2viQ_Qnc6_fgKR1W7-k1lbVURi8hVbMlQAMSOCSnyk</td>\n",
       "      <td>ThirdDegr3e</td>\n",
       "      <td>2013-07-13T20:48:22.967000</td>\n",
       "      <td>**CHECK OUT MY NEW MIXTAPE**** **CHECK OUT MY ...</td>\n",
       "      <td>1</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>360</th>\n",
       "      <td>_2viQ_Qnc69MEEHHJxZ427KX8MlljJPnUC2YBbvbWwY</td>\n",
       "      <td>ThirdDegr3e</td>\n",
       "      <td>2013-07-13T20:48:06.033000</td>\n",
       "      <td>**CHECK OUT MY NEW MIXTAPE**** **CHECK OUT MY ...</td>\n",
       "      <td>1</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>361</th>\n",
       "      <td>_2viQ_Qnc6_RKHVetk9kLzx8ZC62_J7y73FWFSBTe8Q</td>\n",
       "      <td>ThirdDegr3e</td>\n",
       "      <td>2013-07-13T20:47:40.793000</td>\n",
       "      <td>**CHECK OUT MY NEW MIXTAPE**** **CHECK OUT MY ...</td>\n",
       "      <td>1</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>362</th>\n",
       "      <td>_2viQ_Qnc68TufyXKiTwky80ewSPbhRiD5XFHrJH9lg</td>\n",
       "      <td>Ysobel Schofield</td>\n",
       "      <td>2013-07-13T20:17:25.181000</td>\n",
       "      <td>Waka waka she rules</td>\n",
       "      <td>0</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>363</th>\n",
       "      <td>_2viQ_Qnc689m-WiwOwvrQU7LvkLAgspnfXL8ovE0ME</td>\n",
       "      <td>TheHotChocolate</td>\n",
       "      <td>2013-07-13T18:26:37.017000</td>\n",
       "      <td>she is sooooo beautiful!</td>\n",
       "      <td>0</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>364</th>\n",
       "      <td>_2viQ_Qnc6_1Hq9MGlefkBIszt9rYD3S_CozADvMhQ4</td>\n",
       "      <td>Dinova Sharon</td>\n",
       "      <td>2013-07-13T14:44:00.700000</td>\n",
       "      <td>well done shakira</td>\n",
       "      <td>0</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>365</th>\n",
       "      <td>_2viQ_Qnc6-bMSjqyL1NKj57ROicCSJV5SwTrw-RFFA</td>\n",
       "      <td>Katie Mettam</td>\n",
       "      <td>2013-07-13T13:27:39.441000</td>\n",
       "      <td>I love this song because we sing it at Camp al...</td>\n",
       "      <td>0</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>366</th>\n",
       "      <td>_2viQ_Qnc6-pY-1yR6K2FhmC5i48-WuNx5CumlHLDAI</td>\n",
       "      <td>Sabina Pearson-Smith</td>\n",
       "      <td>2013-07-13T13:14:30.021000</td>\n",
       "      <td>I love this song for two reasons: 1.it is abou...</td>\n",
       "      <td>0</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>367</th>\n",
       "      <td>_2viQ_Qnc6_k_n_Bse9zVhJP8tJReZpo8uM2uZfnzDs</td>\n",
       "      <td>jeffrey jules</td>\n",
       "      <td>2013-07-13T12:09:31.188000</td>\n",
       "      <td>wow</td>\n",
       "      <td>0</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>368</th>\n",
       "      <td>_2viQ_Qnc6_yBt8UGMWyg3vh0PulTqcqyQtdE7d4Fl0</td>\n",
       "      <td>Aishlin Maciel</td>\n",
       "      <td>2013-07-13T11:17:52.308000</td>\n",
       "      <td>Shakira u are so wiredo</td>\n",
       "      <td>0</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>369</th>\n",
       "      <td>_2viQ_Qnc685RPw1aSa1tfrIuHXRvAQ2rPT9R06KTqA</td>\n",
       "      <td>Latin Bosch</td>\n",
       "      <td>2013-07-12T22:33:27.916000</td>\n",
       "      <td>Shakira is the best dancer</td>\n",
       "      <td>0</td>\n",
       "    </tr>\n",
       "  </tbody>\n",
       "</table>\n",
       "<p>1956 rows × 5 columns</p>\n",
       "</div>"
      ],
      "text/plain": [
       "                                      COMMENT_ID                AUTHOR  \\\n",
       "0    LZQPQhLyRh80UYxNuaDWhIGQYNQ96IuCg-AYWqNPjpU             Julius NM   \n",
       "1    LZQPQhLyRh_C2cTtd9MvFRJedxydaVW-2sNg5Diuo4A           adam riyati   \n",
       "2    LZQPQhLyRh9MSZYnf8djyk0gEF9BHDPYrrK-qCczIY8      Evgeny Murashkin   \n",
       "3            z13jhp0bxqncu512g22wvzkasxmvvzjaz04       ElNino Melendez   \n",
       "4            z13fwbwp1oujthgqj04chlngpvzmtt3r3dw                GsMega   \n",
       "5    LZQPQhLyRh9-wNRtlZDM90f1k0BrdVdJyN_YsaSwfxc          Jason Haddad   \n",
       "6            z13lfzdo5vmdi1cm123te5uz2mqig1brz04        ferleck ferles   \n",
       "7          z122wfnzgt30fhubn04cdn3xfx2mxzngsl40k          Bob Kanowski   \n",
       "8            z13ttt1jcraqexk2o234ghbgzxymz1zzi04                  Cony   \n",
       "9            z12avveb4xqiirsix04chxviiljryduwxg0           BeBe Burkey   \n",
       "10         z13auhww3oufjn1qo04ci3grqqjmfjexxuo0k             Huckyduck   \n",
       "11             z13xit5agm2zyh4f523rst2gowmbx5bml           Lone Twistt   \n",
       "12             z13pejoiuozwxtdu323dspopnri4xts0f          Archie Lewis   \n",
       "13         z121zxaxsq25z5k5o04ch1o5jqqfij3gtm40k       TheUploadaddict   \n",
       "14         z12oglnpoq3gjh4om04cfdlbgp2uepyytpw0k        Francisco Nora   \n",
       "15           z13phrmwrkfisn5er22eyrbpbvaiwfvwf04  Gaming and Stuff PRO   \n",
       "16           z13bgdvyluihfv11i22rgxwhuvabzz1os04           Zielimeek21   \n",
       "17           z13vxpnoxsyeuv2jr04cctprprb1slnxdf4        OutrightIgnite   \n",
       "18         z12qth5j0ob1fx3q404chvy4fz32tbkpllk0k        Tony K Frazier   \n",
       "19           z13etj0bclzfztuwc04cgfvrgmf3fvjor1g         Jose Renteria   \n",
       "20           z12axnji5w2axxht522thb3bktvqjdlbp04          zhichao wang   \n",
       "21         z13ozdmr4lf3uzc5z04cix2zkyjzgvcyemw0k       Carlos Thegamer   \n",
       "22           z12ohdxjtsatvppjb04cctprprb1slnxdf4        OutrightIgnite   \n",
       "23         z12ntlcqht2bvjewi04cf1up0xjvs5lq3mc0k              Owen Lai   \n",
       "24   LZQPQhLyRh9EXArr4ZnVcDonSbvSMHKYOT24e_qR6fE           ||GuitarZ||   \n",
       "25   LZQPQhLyRh9y57URF7qpZRk3MVAJNLNhhZga_5YWBU8         Living4Techno   \n",
       "26   LZQPQhLyRh9vw01Xvvw5yWzZEUOPG1hSgRMHep55-Yw            8-BitMusic   \n",
       "27           z13kszcinpnvc34v2234fnpxkpmlw3nhc04            Kyle Jaber   \n",
       "28         z13tj514otzlurfbc04ccjwhrnmej1iihqw0k         Brandon Pryor   \n",
       "29         z13zvh1rmk3cf3mby04civbq5mjtddmbysk0k         Fun&amp;Hacks   \n",
       "..                                           ...                   ...   \n",
       "340  _2viQ_Qnc6-1oZCLsUWjl3-g4QrWxMQXsSTs2Hy4MGI         damion taylor   \n",
       "341  _2viQ_Qnc6_YN7xFNAg14zX99Y614Salf57yOcrBRSw       Shadrach Grentz   \n",
       "342  _2viQ_Qnc6_fyjM2m-ismUToowpNFauwtldKlfjbtIk          Joshua Kasey   \n",
       "343  _2viQ_Qnc6_AXZ4E5CeA3LHE36RAijd3QKgUI-YvjWI          ricky swaggz   \n",
       "344  _2viQ_Qnc68ki9xsFeN2y1_ZiHYcZC8Qv1GyHfwqr7Y           steven reed   \n",
       "345  _2viQ_Qnc69mci30y5muwQXNMaeCmIvZ4ca8l_4zPmA       Johnny Rei Vlog   \n",
       "346  _2viQ_Qnc68cxHfQzecR1L9f-hbLNIJ7VxpCZtctPMk  yakikukamo FIRELOVER   \n",
       "347  _2viQ_Qnc6-G71VMp3dR76dfQTcrRHpiNXJh2jm8V_M       Johnny Rei Vlog   \n",
       "348  _2viQ_Qnc6-3Nk200KmVtS-kiCS_1CjKJsMIbXakyfI           hsn moghrbi   \n",
       "349  _2viQ_Qnc6-UzHByAP8y3BxG633jelEC_fxtFRUvLSA     Zuzanna Sztandera   \n",
       "350  _2viQ_Qnc68LpP5gDCaWQuiywObesTUlRgSQExMVMac       Chelsea Fischer   \n",
       "351          z12xc3ly4x3uttmci22xff24nqqxwb0je04         Lisa Matthews   \n",
       "352  _2viQ_Qnc6-PfOjDtTwbTalW_5TRtvBKMcHZdDrcI2o         laura elliott   \n",
       "353  _2viQ_Qnc69GH3FQl348HonbRxpbmtsR5CUei0zkJog         Riley Rollins   \n",
       "354  _2viQ_Qnc69S12dQyWLf0QBgUD29OMTe71geFOn4PJA         Oona Sarlotta   \n",
       "355  _2viQ_Qnc69Ic8yEMHsemUQiq01-kwwqnbDowMO9kdM            lynchee360   \n",
       "356  _2viQ_Qnc6-qHJ_u9Yv84vj4yOAPLUL3ZibCc7b-vBI            FAHAD KHAN   \n",
       "357  _2viQ_Qnc6_HU65mTzCmXnjA-WLt7XqxqPj7EwAtlO0          ricky swaggz   \n",
       "358  _2viQ_Qnc6-jk58CPwBnqfbM6oByJH5oPvCtKecLQyo       Shadrach Grentz   \n",
       "359  _2viQ_Qnc6_fgKR1W7-k1lbVURi8hVbMlQAMSOCSnyk           ThirdDegr3e   \n",
       "360  _2viQ_Qnc69MEEHHJxZ427KX8MlljJPnUC2YBbvbWwY           ThirdDegr3e   \n",
       "361  _2viQ_Qnc6_RKHVetk9kLzx8ZC62_J7y73FWFSBTe8Q           ThirdDegr3e   \n",
       "362  _2viQ_Qnc68TufyXKiTwky80ewSPbhRiD5XFHrJH9lg      Ysobel Schofield   \n",
       "363  _2viQ_Qnc689m-WiwOwvrQU7LvkLAgspnfXL8ovE0ME       TheHotChocolate   \n",
       "364  _2viQ_Qnc6_1Hq9MGlefkBIszt9rYD3S_CozADvMhQ4         Dinova Sharon   \n",
       "365  _2viQ_Qnc6-bMSjqyL1NKj57ROicCSJV5SwTrw-RFFA          Katie Mettam   \n",
       "366  _2viQ_Qnc6-pY-1yR6K2FhmC5i48-WuNx5CumlHLDAI  Sabina Pearson-Smith   \n",
       "367  _2viQ_Qnc6_k_n_Bse9zVhJP8tJReZpo8uM2uZfnzDs         jeffrey jules   \n",
       "368  _2viQ_Qnc6_yBt8UGMWyg3vh0PulTqcqyQtdE7d4Fl0        Aishlin Maciel   \n",
       "369  _2viQ_Qnc685RPw1aSa1tfrIuHXRvAQ2rPT9R06KTqA           Latin Bosch   \n",
       "\n",
       "                           DATE  \\\n",
       "0           2013-11-07T06:20:48   \n",
       "1           2013-11-07T12:37:15   \n",
       "2           2013-11-08T17:34:21   \n",
       "3           2013-11-09T08:28:43   \n",
       "4           2013-11-10T16:05:38   \n",
       "5           2013-11-26T02:55:11   \n",
       "6           2013-11-27T21:39:24   \n",
       "7           2013-11-28T12:33:27   \n",
       "8           2013-11-28T16:01:47   \n",
       "9           2013-11-28T16:30:13   \n",
       "10          2013-11-28T17:06:17   \n",
       "11          2013-11-28T17:34:55   \n",
       "12          2013-11-28T17:54:39   \n",
       "13          2013-11-28T18:12:12   \n",
       "14          2013-11-28T19:52:35   \n",
       "15          2013-11-28T21:14:13   \n",
       "16          2013-11-28T21:49:00   \n",
       "17          2013-11-28T21:55:02   \n",
       "18          2013-11-28T23:57:13   \n",
       "19          2013-11-29T00:22:01   \n",
       "20          2013-11-29T02:13:56   \n",
       "21          2013-12-01T01:20:21   \n",
       "22          2013-12-01T03:30:55   \n",
       "23          2013-12-01T04:51:52   \n",
       "24          2013-12-23T12:54:38   \n",
       "25          2013-12-25T19:46:26   \n",
       "26          2013-12-27T23:07:50   \n",
       "27          2014-01-19T00:21:29   \n",
       "28          2014-01-19T00:36:25   \n",
       "29          2014-01-19T00:42:35   \n",
       "..                          ...   \n",
       "340  2013-07-21T20:20:34.118000   \n",
       "341  2013-07-21T12:21:37.898000   \n",
       "342  2013-07-21T08:26:18.155000   \n",
       "343  2013-07-20T22:09:23.728000   \n",
       "344  2013-07-19T22:12:16.609000   \n",
       "345  2013-07-19T11:41:54.923000   \n",
       "346  2013-07-18T17:07:06.152000   \n",
       "347  2013-07-18T16:50:26.909000   \n",
       "348  2013-07-17T21:14:40.168000   \n",
       "349  2013-07-17T20:41:00.612000   \n",
       "350  2013-07-17T20:34:59.389000   \n",
       "351  2013-07-17T13:56:03.233000   \n",
       "352  2013-07-16T05:48:01.795000   \n",
       "353  2013-07-16T00:30:46.660000   \n",
       "354  2013-07-15T16:08:50.204000   \n",
       "355  2013-07-14T22:38:26.779000   \n",
       "356  2013-07-14T22:06:57.712000   \n",
       "357  2013-07-14T20:40:00.331000   \n",
       "358  2013-07-14T03:11:20.243000   \n",
       "359  2013-07-13T20:48:22.967000   \n",
       "360  2013-07-13T20:48:06.033000   \n",
       "361  2013-07-13T20:47:40.793000   \n",
       "362  2013-07-13T20:17:25.181000   \n",
       "363  2013-07-13T18:26:37.017000   \n",
       "364  2013-07-13T14:44:00.700000   \n",
       "365  2013-07-13T13:27:39.441000   \n",
       "366  2013-07-13T13:14:30.021000   \n",
       "367  2013-07-13T12:09:31.188000   \n",
       "368  2013-07-13T11:17:52.308000   \n",
       "369  2013-07-12T22:33:27.916000   \n",
       "\n",
       "                                               CONTENT  CLASS  \n",
       "0    Huh, anyway check out this you[tube] channel: ...      1  \n",
       "1    Hey guys check out my new channel and our firs...      1  \n",
       "2               just for test I have to say murdev.com      1  \n",
       "3     me shaking my sexy ass on my channel enjoy ^_^ ﻿      1  \n",
       "4              watch?v=vtaRGgvGtWQ   Check this out .﻿      1  \n",
       "5    Hey, check out my new website!! This site is a...      1  \n",
       "6                            Subscribe to my channel ﻿      1  \n",
       "7    i turned it on mute as soon is i came on i jus...      0  \n",
       "8      You should check my channel for Funny VIDEOS!!﻿      1  \n",
       "9    and u should.d check my channel and tell me wh...      1  \n",
       "10                                Hey subscribe to me﻿      1  \n",
       "11    Once you have started reading do not stop. If...      1  \n",
       "12                https://twitter.com/GBphotographyGB﻿      1  \n",
       "13                             subscribe like comment﻿      1  \n",
       "14   please like :D https://premium.easypromosapp.c...      1  \n",
       "15   Hello! Do you like gaming, art videos, scienti...      1  \n",
       "16                        I'm only checking the views﻿      0  \n",
       "17   http://www.ebay.com/itm/171183229277?ssPageNam...      1  \n",
       "18        http://ubuntuone.com/40beUutVu2ZKxK4uTgPZ8K﻿      1  \n",
       "19   We are an EDM apparel company dedicated to bri...      1  \n",
       "20   i think about 100 millions of the views come f...      0  \n",
       "21                  subscribe to my channel people :D﻿      1  \n",
       "22   Show your AUBURN PRIDE HERE: http://www.teespr...      1  \n",
       "23                            just checking the views﻿      0  \n",
       "24                                CHECK OUT MY CHANNEL      1  \n",
       "25   marketglory . com/strategygame/andrijamatf ear...      1  \n",
       "26   Hey guys! Im a 12 yr old music producer. I mak...      1  \n",
       "27             Check me out! I'm kyle. I rap so yeah ﻿      1  \n",
       "28   I dont even watch it anymore i just come here ...      0  \n",
       "29    Subscribe to me for free Android games, apps.. ﻿      1  \n",
       "..                                                 ...    ...  \n",
       "340                             check out my new video      1  \n",
       "341  Hey Music Fans I really appreciate all of you ...      1  \n",
       "342  Hello everyone, It Is not my intention to spam...      1  \n",
       "343  ******* Facebook is LAME and so 2004! Check ou...      1  \n",
       "344  Please check out and send to others Freedom an...      1  \n",
       "345  Nice to meet You - this is Johnny: 1. If You a...      1  \n",
       "346   hey you ! check out the channel of Alvar Lake !!      1  \n",
       "347  Hi -this is Johnny: 1. If You already know my ...      1  \n",
       "348                                                wow      0  \n",
       "349                                  Love this song!!!      0  \n",
       "350                              Love this song !!!!!!      0  \n",
       "351  Check out this video on YouTube:<br />&quot;Th...      1  \n",
       "352  i watched this because of the large amount of ...      0  \n",
       "353  O peoples of the earth, I have seen how you pe...      1  \n",
       "354               this song always gives me chills! :)      0  \n",
       "355                                I love dis song!! 3      0  \n",
       "356  I WILL NEVER FORGET THIS SONG IN MY LIFE LIKE ...      1  \n",
       "357  ********OMG Facebook is OLD! Check out  ------...      1  \n",
       "358  Hey Music Fans I really appreciate all of you ...      1  \n",
       "359  **CHECK OUT MY NEW MIXTAPE**** **CHECK OUT MY ...      1  \n",
       "360  **CHECK OUT MY NEW MIXTAPE**** **CHECK OUT MY ...      1  \n",
       "361  **CHECK OUT MY NEW MIXTAPE**** **CHECK OUT MY ...      1  \n",
       "362                                Waka waka she rules      0  \n",
       "363                           she is sooooo beautiful!      0  \n",
       "364                                  well done shakira      0  \n",
       "365  I love this song because we sing it at Camp al...      0  \n",
       "366  I love this song for two reasons: 1.it is abou...      0  \n",
       "367                                                wow      0  \n",
       "368                            Shakira u are so wiredo      0  \n",
       "369                         Shakira is the best dancer      0  \n",
       "\n",
       "[1956 rows x 5 columns]"
      ]
     },
     "execution_count": 10,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "# Show the merged frame; the rendered output abbreviates the middle rows\n",
    "df_merged"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 11,
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/plain": [
       "(1956, 5)"
      ]
     },
     "execution_count": 11,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "# Total size of the merged frame as a (rows, columns) tuple\n",
    "df_merged.shape"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 12,
   "metadata": {},
   "outputs": [],
   "source": [
    "# Outer index labels for the keyed merge (passed to pd.concat below)\n",
    "keys = [\n",
    "    \"Psy\",\n",
    "    \"KatyPerry\",\n",
    "    \"LMFAO\",\n",
    "    \"Eminem\",\n",
    "    \"Shakira\",\n",
    "]"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 13,
   "metadata": {},
   "outputs": [],
   "source": [
    "# Stack the frames (built in an earlier cell); each frame's rows are tagged\n",
    "# with its key as an outer index level\n",
    "df_with_keys = pd.concat(objs=frames, keys=keys)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 14,
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/html": [
       "<div>\n",
       "<style scoped>\n",
       "    .dataframe tbody tr th:only-of-type {\n",
       "        vertical-align: middle;\n",
       "    }\n",
       "\n",
       "    .dataframe tbody tr th {\n",
       "        vertical-align: top;\n",
       "    }\n",
       "\n",
       "    .dataframe thead th {\n",
       "        text-align: right;\n",
       "    }\n",
       "</style>\n",
       "<table border=\"1\" class=\"dataframe\">\n",
       "  <thead>\n",
       "    <tr style=\"text-align: right;\">\n",
       "      <th></th>\n",
       "      <th></th>\n",
       "      <th>COMMENT_ID</th>\n",
       "      <th>AUTHOR</th>\n",
       "      <th>DATE</th>\n",
       "      <th>CONTENT</th>\n",
       "      <th>CLASS</th>\n",
       "    </tr>\n",
       "  </thead>\n",
       "  <tbody>\n",
       "    <tr>\n",
       "      <th rowspan=\"30\" valign=\"top\">Psy</th>\n",
       "      <th>0</th>\n",
       "      <td>LZQPQhLyRh80UYxNuaDWhIGQYNQ96IuCg-AYWqNPjpU</td>\n",
       "      <td>Julius NM</td>\n",
       "      <td>2013-11-07T06:20:48</td>\n",
       "      <td>Huh, anyway check out this you[tube] channel: ...</td>\n",
       "      <td>1</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>1</th>\n",
       "      <td>LZQPQhLyRh_C2cTtd9MvFRJedxydaVW-2sNg5Diuo4A</td>\n",
       "      <td>adam riyati</td>\n",
       "      <td>2013-11-07T12:37:15</td>\n",
       "      <td>Hey guys check out my new channel and our firs...</td>\n",
       "      <td>1</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>2</th>\n",
       "      <td>LZQPQhLyRh9MSZYnf8djyk0gEF9BHDPYrrK-qCczIY8</td>\n",
       "      <td>Evgeny Murashkin</td>\n",
       "      <td>2013-11-08T17:34:21</td>\n",
       "      <td>just for test I have to say murdev.com</td>\n",
       "      <td>1</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>3</th>\n",
       "      <td>z13jhp0bxqncu512g22wvzkasxmvvzjaz04</td>\n",
       "      <td>ElNino Melendez</td>\n",
       "      <td>2013-11-09T08:28:43</td>\n",
       "      <td>me shaking my sexy ass on my channel enjoy ^_^ ﻿</td>\n",
       "      <td>1</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>4</th>\n",
       "      <td>z13fwbwp1oujthgqj04chlngpvzmtt3r3dw</td>\n",
       "      <td>GsMega</td>\n",
       "      <td>2013-11-10T16:05:38</td>\n",
       "      <td>watch?v=vtaRGgvGtWQ   Check this out .﻿</td>\n",
       "      <td>1</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>5</th>\n",
       "      <td>LZQPQhLyRh9-wNRtlZDM90f1k0BrdVdJyN_YsaSwfxc</td>\n",
       "      <td>Jason Haddad</td>\n",
       "      <td>2013-11-26T02:55:11</td>\n",
       "      <td>Hey, check out my new website!! This site is a...</td>\n",
       "      <td>1</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>6</th>\n",
       "      <td>z13lfzdo5vmdi1cm123te5uz2mqig1brz04</td>\n",
       "      <td>ferleck ferles</td>\n",
       "      <td>2013-11-27T21:39:24</td>\n",
       "      <td>Subscribe to my channel ﻿</td>\n",
       "      <td>1</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>7</th>\n",
       "      <td>z122wfnzgt30fhubn04cdn3xfx2mxzngsl40k</td>\n",
       "      <td>Bob Kanowski</td>\n",
       "      <td>2013-11-28T12:33:27</td>\n",
       "      <td>i turned it on mute as soon is i came on i jus...</td>\n",
       "      <td>0</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>8</th>\n",
       "      <td>z13ttt1jcraqexk2o234ghbgzxymz1zzi04</td>\n",
       "      <td>Cony</td>\n",
       "      <td>2013-11-28T16:01:47</td>\n",
       "      <td>You should check my channel for Funny VIDEOS!!﻿</td>\n",
       "      <td>1</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>9</th>\n",
       "      <td>z12avveb4xqiirsix04chxviiljryduwxg0</td>\n",
       "      <td>BeBe Burkey</td>\n",
       "      <td>2013-11-28T16:30:13</td>\n",
       "      <td>and u should.d check my channel and tell me wh...</td>\n",
       "      <td>1</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>10</th>\n",
       "      <td>z13auhww3oufjn1qo04ci3grqqjmfjexxuo0k</td>\n",
       "      <td>Huckyduck</td>\n",
       "      <td>2013-11-28T17:06:17</td>\n",
       "      <td>Hey subscribe to me﻿</td>\n",
       "      <td>1</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>11</th>\n",
       "      <td>z13xit5agm2zyh4f523rst2gowmbx5bml</td>\n",
       "      <td>Lone Twistt</td>\n",
       "      <td>2013-11-28T17:34:55</td>\n",
       "      <td>Once you have started reading do not stop. If...</td>\n",
       "      <td>1</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>12</th>\n",
       "      <td>z13pejoiuozwxtdu323dspopnri4xts0f</td>\n",
       "      <td>Archie Lewis</td>\n",
       "      <td>2013-11-28T17:54:39</td>\n",
       "      <td>https://twitter.com/GBphotographyGB﻿</td>\n",
       "      <td>1</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>13</th>\n",
       "      <td>z121zxaxsq25z5k5o04ch1o5jqqfij3gtm40k</td>\n",
       "      <td>TheUploadaddict</td>\n",
       "      <td>2013-11-28T18:12:12</td>\n",
       "      <td>subscribe like comment﻿</td>\n",
       "      <td>1</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>14</th>\n",
       "      <td>z12oglnpoq3gjh4om04cfdlbgp2uepyytpw0k</td>\n",
       "      <td>Francisco Nora</td>\n",
       "      <td>2013-11-28T19:52:35</td>\n",
       "      <td>please like :D https://premium.easypromosapp.c...</td>\n",
       "      <td>1</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>15</th>\n",
       "      <td>z13phrmwrkfisn5er22eyrbpbvaiwfvwf04</td>\n",
       "      <td>Gaming and Stuff PRO</td>\n",
       "      <td>2013-11-28T21:14:13</td>\n",
       "      <td>Hello! Do you like gaming, art videos, scienti...</td>\n",
       "      <td>1</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>16</th>\n",
       "      <td>z13bgdvyluihfv11i22rgxwhuvabzz1os04</td>\n",
       "      <td>Zielimeek21</td>\n",
       "      <td>2013-11-28T21:49:00</td>\n",
       "      <td>I'm only checking the views﻿</td>\n",
       "      <td>0</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>17</th>\n",
       "      <td>z13vxpnoxsyeuv2jr04cctprprb1slnxdf4</td>\n",
       "      <td>OutrightIgnite</td>\n",
       "      <td>2013-11-28T21:55:02</td>\n",
       "      <td>http://www.ebay.com/itm/171183229277?ssPageNam...</td>\n",
       "      <td>1</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>18</th>\n",
       "      <td>z12qth5j0ob1fx3q404chvy4fz32tbkpllk0k</td>\n",
       "      <td>Tony K Frazier</td>\n",
       "      <td>2013-11-28T23:57:13</td>\n",
       "      <td>http://ubuntuone.com/40beUutVu2ZKxK4uTgPZ8K﻿</td>\n",
       "      <td>1</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>19</th>\n",
       "      <td>z13etj0bclzfztuwc04cgfvrgmf3fvjor1g</td>\n",
       "      <td>Jose Renteria</td>\n",
       "      <td>2013-11-29T00:22:01</td>\n",
       "      <td>We are an EDM apparel company dedicated to bri...</td>\n",
       "      <td>1</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>20</th>\n",
       "      <td>z12axnji5w2axxht522thb3bktvqjdlbp04</td>\n",
       "      <td>zhichao wang</td>\n",
       "      <td>2013-11-29T02:13:56</td>\n",
       "      <td>i think about 100 millions of the views come f...</td>\n",
       "      <td>0</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>21</th>\n",
       "      <td>z13ozdmr4lf3uzc5z04cix2zkyjzgvcyemw0k</td>\n",
       "      <td>Carlos Thegamer</td>\n",
       "      <td>2013-12-01T01:20:21</td>\n",
       "      <td>subscribe to my channel people :D﻿</td>\n",
       "      <td>1</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>22</th>\n",
       "      <td>z12ohdxjtsatvppjb04cctprprb1slnxdf4</td>\n",
       "      <td>OutrightIgnite</td>\n",
       "      <td>2013-12-01T03:30:55</td>\n",
       "      <td>Show your AUBURN PRIDE HERE: http://www.teespr...</td>\n",
       "      <td>1</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>23</th>\n",
       "      <td>z12ntlcqht2bvjewi04cf1up0xjvs5lq3mc0k</td>\n",
       "      <td>Owen Lai</td>\n",
       "      <td>2013-12-01T04:51:52</td>\n",
       "      <td>just checking the views﻿</td>\n",
       "      <td>0</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>24</th>\n",
       "      <td>LZQPQhLyRh9EXArr4ZnVcDonSbvSMHKYOT24e_qR6fE</td>\n",
       "      <td>||GuitarZ||</td>\n",
       "      <td>2013-12-23T12:54:38</td>\n",
       "      <td>CHECK OUT MY CHANNEL</td>\n",
       "      <td>1</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>25</th>\n",
       "      <td>LZQPQhLyRh9y57URF7qpZRk3MVAJNLNhhZga_5YWBU8</td>\n",
       "      <td>Living4Techno</td>\n",
       "      <td>2013-12-25T19:46:26</td>\n",
       "      <td>marketglory . com/strategygame/andrijamatf ear...</td>\n",
       "      <td>1</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>26</th>\n",
       "      <td>LZQPQhLyRh9vw01Xvvw5yWzZEUOPG1hSgRMHep55-Yw</td>\n",
       "      <td>8-BitMusic</td>\n",
       "      <td>2013-12-27T23:07:50</td>\n",
       "      <td>Hey guys! Im a 12 yr old music producer. I mak...</td>\n",
       "      <td>1</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>27</th>\n",
       "      <td>z13kszcinpnvc34v2234fnpxkpmlw3nhc04</td>\n",
       "      <td>Kyle Jaber</td>\n",
       "      <td>2014-01-19T00:21:29</td>\n",
       "      <td>Check me out! I'm kyle. I rap so yeah ﻿</td>\n",
       "      <td>1</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>28</th>\n",
       "      <td>z13tj514otzlurfbc04ccjwhrnmej1iihqw0k</td>\n",
       "      <td>Brandon Pryor</td>\n",
       "      <td>2014-01-19T00:36:25</td>\n",
       "      <td>I dont even watch it anymore i just come here ...</td>\n",
       "      <td>0</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>29</th>\n",
       "      <td>z13zvh1rmk3cf3mby04civbq5mjtddmbysk0k</td>\n",
       "      <td>Fun&amp;amp;Hacks</td>\n",
       "      <td>2014-01-19T00:42:35</td>\n",
       "      <td>Subscribe to me for free Android games, apps.. ﻿</td>\n",
       "      <td>1</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>...</th>\n",
       "      <th>...</th>\n",
       "      <td>...</td>\n",
       "      <td>...</td>\n",
       "      <td>...</td>\n",
       "      <td>...</td>\n",
       "      <td>...</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th rowspan=\"30\" valign=\"top\">Shakira</th>\n",
       "      <th>340</th>\n",
       "      <td>_2viQ_Qnc6-1oZCLsUWjl3-g4QrWxMQXsSTs2Hy4MGI</td>\n",
       "      <td>damion taylor</td>\n",
       "      <td>2013-07-21T20:20:34.118000</td>\n",
       "      <td>check out my new video</td>\n",
       "      <td>1</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>341</th>\n",
       "      <td>_2viQ_Qnc6_YN7xFNAg14zX99Y614Salf57yOcrBRSw</td>\n",
       "      <td>Shadrach Grentz</td>\n",
       "      <td>2013-07-21T12:21:37.898000</td>\n",
       "      <td>Hey Music Fans I really appreciate all of you ...</td>\n",
       "      <td>1</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>342</th>\n",
       "      <td>_2viQ_Qnc6_fyjM2m-ismUToowpNFauwtldKlfjbtIk</td>\n",
       "      <td>Joshua Kasey</td>\n",
       "      <td>2013-07-21T08:26:18.155000</td>\n",
       "      <td>Hello everyone, It Is not my intention to spam...</td>\n",
       "      <td>1</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>343</th>\n",
       "      <td>_2viQ_Qnc6_AXZ4E5CeA3LHE36RAijd3QKgUI-YvjWI</td>\n",
       "      <td>ricky swaggz</td>\n",
       "      <td>2013-07-20T22:09:23.728000</td>\n",
       "      <td>******* Facebook is LAME and so 2004! Check ou...</td>\n",
       "      <td>1</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>344</th>\n",
       "      <td>_2viQ_Qnc68ki9xsFeN2y1_ZiHYcZC8Qv1GyHfwqr7Y</td>\n",
       "      <td>steven reed</td>\n",
       "      <td>2013-07-19T22:12:16.609000</td>\n",
       "      <td>Please check out and send to others Freedom an...</td>\n",
       "      <td>1</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>345</th>\n",
       "      <td>_2viQ_Qnc69mci30y5muwQXNMaeCmIvZ4ca8l_4zPmA</td>\n",
       "      <td>Johnny Rei Vlog</td>\n",
       "      <td>2013-07-19T11:41:54.923000</td>\n",
       "      <td>Nice to meet You - this is Johnny: 1. If You a...</td>\n",
       "      <td>1</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>346</th>\n",
       "      <td>_2viQ_Qnc68cxHfQzecR1L9f-hbLNIJ7VxpCZtctPMk</td>\n",
       "      <td>yakikukamo FIRELOVER</td>\n",
       "      <td>2013-07-18T17:07:06.152000</td>\n",
       "      <td>hey you ! check out the channel of Alvar Lake !!</td>\n",
       "      <td>1</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>347</th>\n",
       "      <td>_2viQ_Qnc6-G71VMp3dR76dfQTcrRHpiNXJh2jm8V_M</td>\n",
       "      <td>Johnny Rei Vlog</td>\n",
       "      <td>2013-07-18T16:50:26.909000</td>\n",
       "      <td>Hi -this is Johnny: 1. If You already know my ...</td>\n",
       "      <td>1</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>348</th>\n",
       "      <td>_2viQ_Qnc6-3Nk200KmVtS-kiCS_1CjKJsMIbXakyfI</td>\n",
       "      <td>hsn moghrbi</td>\n",
       "      <td>2013-07-17T21:14:40.168000</td>\n",
       "      <td>wow</td>\n",
       "      <td>0</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>349</th>\n",
       "      <td>_2viQ_Qnc6-UzHByAP8y3BxG633jelEC_fxtFRUvLSA</td>\n",
       "      <td>Zuzanna Sztandera</td>\n",
       "      <td>2013-07-17T20:41:00.612000</td>\n",
       "      <td>Love this song!!!</td>\n",
       "      <td>0</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>350</th>\n",
       "      <td>_2viQ_Qnc68LpP5gDCaWQuiywObesTUlRgSQExMVMac</td>\n",
       "      <td>Chelsea Fischer</td>\n",
       "      <td>2013-07-17T20:34:59.389000</td>\n",
       "      <td>Love this song !!!!!!</td>\n",
       "      <td>0</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>351</th>\n",
       "      <td>z12xc3ly4x3uttmci22xff24nqqxwb0je04</td>\n",
       "      <td>Lisa Matthews</td>\n",
       "      <td>2013-07-17T13:56:03.233000</td>\n",
       "      <td>Check out this video on YouTube:&lt;br /&gt;&amp;quot;Th...</td>\n",
       "      <td>1</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>352</th>\n",
       "      <td>_2viQ_Qnc6-PfOjDtTwbTalW_5TRtvBKMcHZdDrcI2o</td>\n",
       "      <td>laura elliott</td>\n",
       "      <td>2013-07-16T05:48:01.795000</td>\n",
       "      <td>i watched this because of the large amount of ...</td>\n",
       "      <td>0</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>353</th>\n",
       "      <td>_2viQ_Qnc69GH3FQl348HonbRxpbmtsR5CUei0zkJog</td>\n",
       "      <td>Riley Rollins</td>\n",
       "      <td>2013-07-16T00:30:46.660000</td>\n",
       "      <td>O peoples of the earth, I have seen how you pe...</td>\n",
       "      <td>1</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>354</th>\n",
       "      <td>_2viQ_Qnc69S12dQyWLf0QBgUD29OMTe71geFOn4PJA</td>\n",
       "      <td>Oona Sarlotta</td>\n",
       "      <td>2013-07-15T16:08:50.204000</td>\n",
       "      <td>this song always gives me chills! :)</td>\n",
       "      <td>0</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>355</th>\n",
       "      <td>_2viQ_Qnc69Ic8yEMHsemUQiq01-kwwqnbDowMO9kdM</td>\n",
       "      <td>lynchee360</td>\n",
       "      <td>2013-07-14T22:38:26.779000</td>\n",
       "      <td>I love dis song!! 3</td>\n",
       "      <td>0</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>356</th>\n",
       "      <td>_2viQ_Qnc6-qHJ_u9Yv84vj4yOAPLUL3ZibCc7b-vBI</td>\n",
       "      <td>FAHAD KHAN</td>\n",
       "      <td>2013-07-14T22:06:57.712000</td>\n",
       "      <td>I WILL NEVER FORGET THIS SONG IN MY LIFE LIKE ...</td>\n",
       "      <td>1</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>357</th>\n",
       "      <td>_2viQ_Qnc6_HU65mTzCmXnjA-WLt7XqxqPj7EwAtlO0</td>\n",
       "      <td>ricky swaggz</td>\n",
       "      <td>2013-07-14T20:40:00.331000</td>\n",
       "      <td>********OMG Facebook is OLD! Check out  ------...</td>\n",
       "      <td>1</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>358</th>\n",
       "      <td>_2viQ_Qnc6-jk58CPwBnqfbM6oByJH5oPvCtKecLQyo</td>\n",
       "      <td>Shadrach Grentz</td>\n",
       "      <td>2013-07-14T03:11:20.243000</td>\n",
       "      <td>Hey Music Fans I really appreciate all of you ...</td>\n",
       "      <td>1</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>359</th>\n",
       "      <td>_2viQ_Qnc6_fgKR1W7-k1lbVURi8hVbMlQAMSOCSnyk</td>\n",
       "      <td>ThirdDegr3e</td>\n",
       "      <td>2013-07-13T20:48:22.967000</td>\n",
       "      <td>**CHECK OUT MY NEW MIXTAPE**** **CHECK OUT MY ...</td>\n",
       "      <td>1</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>360</th>\n",
       "      <td>_2viQ_Qnc69MEEHHJxZ427KX8MlljJPnUC2YBbvbWwY</td>\n",
       "      <td>ThirdDegr3e</td>\n",
       "      <td>2013-07-13T20:48:06.033000</td>\n",
       "      <td>**CHECK OUT MY NEW MIXTAPE**** **CHECK OUT MY ...</td>\n",
       "      <td>1</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>361</th>\n",
       "      <td>_2viQ_Qnc6_RKHVetk9kLzx8ZC62_J7y73FWFSBTe8Q</td>\n",
       "      <td>ThirdDegr3e</td>\n",
       "      <td>2013-07-13T20:47:40.793000</td>\n",
       "      <td>**CHECK OUT MY NEW MIXTAPE**** **CHECK OUT MY ...</td>\n",
       "      <td>1</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>362</th>\n",
       "      <td>_2viQ_Qnc68TufyXKiTwky80ewSPbhRiD5XFHrJH9lg</td>\n",
       "      <td>Ysobel Schofield</td>\n",
       "      <td>2013-07-13T20:17:25.181000</td>\n",
       "      <td>Waka waka she rules</td>\n",
       "      <td>0</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>363</th>\n",
       "      <td>_2viQ_Qnc689m-WiwOwvrQU7LvkLAgspnfXL8ovE0ME</td>\n",
       "      <td>TheHotChocolate</td>\n",
       "      <td>2013-07-13T18:26:37.017000</td>\n",
       "      <td>she is sooooo beautiful!</td>\n",
       "      <td>0</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>364</th>\n",
       "      <td>_2viQ_Qnc6_1Hq9MGlefkBIszt9rYD3S_CozADvMhQ4</td>\n",
       "      <td>Dinova Sharon</td>\n",
       "      <td>2013-07-13T14:44:00.700000</td>\n",
       "      <td>well done shakira</td>\n",
       "      <td>0</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>365</th>\n",
       "      <td>_2viQ_Qnc6-bMSjqyL1NKj57ROicCSJV5SwTrw-RFFA</td>\n",
       "      <td>Katie Mettam</td>\n",
       "      <td>2013-07-13T13:27:39.441000</td>\n",
       "      <td>I love this song because we sing it at Camp al...</td>\n",
       "      <td>0</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>366</th>\n",
       "      <td>_2viQ_Qnc6-pY-1yR6K2FhmC5i48-WuNx5CumlHLDAI</td>\n",
       "      <td>Sabina Pearson-Smith</td>\n",
       "      <td>2013-07-13T13:14:30.021000</td>\n",
       "      <td>I love this song for two reasons: 1.it is abou...</td>\n",
       "      <td>0</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>367</th>\n",
       "      <td>_2viQ_Qnc6_k_n_Bse9zVhJP8tJReZpo8uM2uZfnzDs</td>\n",
       "      <td>jeffrey jules</td>\n",
       "      <td>2013-07-13T12:09:31.188000</td>\n",
       "      <td>wow</td>\n",
       "      <td>0</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>368</th>\n",
       "      <td>_2viQ_Qnc6_yBt8UGMWyg3vh0PulTqcqyQtdE7d4Fl0</td>\n",
       "      <td>Aishlin Maciel</td>\n",
       "      <td>2013-07-13T11:17:52.308000</td>\n",
       "      <td>Shakira u are so wiredo</td>\n",
       "      <td>0</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>369</th>\n",
       "      <td>_2viQ_Qnc685RPw1aSa1tfrIuHXRvAQ2rPT9R06KTqA</td>\n",
       "      <td>Latin Bosch</td>\n",
       "      <td>2013-07-12T22:33:27.916000</td>\n",
       "      <td>Shakira is the best dancer</td>\n",
       "      <td>0</td>\n",
       "    </tr>\n",
       "  </tbody>\n",
       "</table>\n",
       "<p>1956 rows × 5 columns</p>\n",
       "</div>"
      ],
      "text/plain": [
       "                                              COMMENT_ID  \\\n",
       "Psy     0    LZQPQhLyRh80UYxNuaDWhIGQYNQ96IuCg-AYWqNPjpU   \n",
       "        1    LZQPQhLyRh_C2cTtd9MvFRJedxydaVW-2sNg5Diuo4A   \n",
       "        2    LZQPQhLyRh9MSZYnf8djyk0gEF9BHDPYrrK-qCczIY8   \n",
       "        3            z13jhp0bxqncu512g22wvzkasxmvvzjaz04   \n",
       "        4            z13fwbwp1oujthgqj04chlngpvzmtt3r3dw   \n",
       "        5    LZQPQhLyRh9-wNRtlZDM90f1k0BrdVdJyN_YsaSwfxc   \n",
       "        6            z13lfzdo5vmdi1cm123te5uz2mqig1brz04   \n",
       "        7          z122wfnzgt30fhubn04cdn3xfx2mxzngsl40k   \n",
       "        8            z13ttt1jcraqexk2o234ghbgzxymz1zzi04   \n",
       "        9            z12avveb4xqiirsix04chxviiljryduwxg0   \n",
       "        10         z13auhww3oufjn1qo04ci3grqqjmfjexxuo0k   \n",
       "        11             z13xit5agm2zyh4f523rst2gowmbx5bml   \n",
       "        12             z13pejoiuozwxtdu323dspopnri4xts0f   \n",
       "        13         z121zxaxsq25z5k5o04ch1o5jqqfij3gtm40k   \n",
       "        14         z12oglnpoq3gjh4om04cfdlbgp2uepyytpw0k   \n",
       "        15           z13phrmwrkfisn5er22eyrbpbvaiwfvwf04   \n",
       "        16           z13bgdvyluihfv11i22rgxwhuvabzz1os04   \n",
       "        17           z13vxpnoxsyeuv2jr04cctprprb1slnxdf4   \n",
       "        18         z12qth5j0ob1fx3q404chvy4fz32tbkpllk0k   \n",
       "        19           z13etj0bclzfztuwc04cgfvrgmf3fvjor1g   \n",
       "        20           z12axnji5w2axxht522thb3bktvqjdlbp04   \n",
       "        21         z13ozdmr4lf3uzc5z04cix2zkyjzgvcyemw0k   \n",
       "        22           z12ohdxjtsatvppjb04cctprprb1slnxdf4   \n",
       "        23         z12ntlcqht2bvjewi04cf1up0xjvs5lq3mc0k   \n",
       "        24   LZQPQhLyRh9EXArr4ZnVcDonSbvSMHKYOT24e_qR6fE   \n",
       "        25   LZQPQhLyRh9y57URF7qpZRk3MVAJNLNhhZga_5YWBU8   \n",
       "        26   LZQPQhLyRh9vw01Xvvw5yWzZEUOPG1hSgRMHep55-Yw   \n",
       "        27           z13kszcinpnvc34v2234fnpxkpmlw3nhc04   \n",
       "        28         z13tj514otzlurfbc04ccjwhrnmej1iihqw0k   \n",
       "        29         z13zvh1rmk3cf3mby04civbq5mjtddmbysk0k   \n",
       "...                                                  ...   \n",
       "Shakira 340  _2viQ_Qnc6-1oZCLsUWjl3-g4QrWxMQXsSTs2Hy4MGI   \n",
       "        341  _2viQ_Qnc6_YN7xFNAg14zX99Y614Salf57yOcrBRSw   \n",
       "        342  _2viQ_Qnc6_fyjM2m-ismUToowpNFauwtldKlfjbtIk   \n",
       "        343  _2viQ_Qnc6_AXZ4E5CeA3LHE36RAijd3QKgUI-YvjWI   \n",
       "        344  _2viQ_Qnc68ki9xsFeN2y1_ZiHYcZC8Qv1GyHfwqr7Y   \n",
       "        345  _2viQ_Qnc69mci30y5muwQXNMaeCmIvZ4ca8l_4zPmA   \n",
       "        346  _2viQ_Qnc68cxHfQzecR1L9f-hbLNIJ7VxpCZtctPMk   \n",
       "        347  _2viQ_Qnc6-G71VMp3dR76dfQTcrRHpiNXJh2jm8V_M   \n",
       "        348  _2viQ_Qnc6-3Nk200KmVtS-kiCS_1CjKJsMIbXakyfI   \n",
       "        349  _2viQ_Qnc6-UzHByAP8y3BxG633jelEC_fxtFRUvLSA   \n",
       "        350  _2viQ_Qnc68LpP5gDCaWQuiywObesTUlRgSQExMVMac   \n",
       "        351          z12xc3ly4x3uttmci22xff24nqqxwb0je04   \n",
       "        352  _2viQ_Qnc6-PfOjDtTwbTalW_5TRtvBKMcHZdDrcI2o   \n",
       "        353  _2viQ_Qnc69GH3FQl348HonbRxpbmtsR5CUei0zkJog   \n",
       "        354  _2viQ_Qnc69S12dQyWLf0QBgUD29OMTe71geFOn4PJA   \n",
       "        355  _2viQ_Qnc69Ic8yEMHsemUQiq01-kwwqnbDowMO9kdM   \n",
       "        356  _2viQ_Qnc6-qHJ_u9Yv84vj4yOAPLUL3ZibCc7b-vBI   \n",
       "        357  _2viQ_Qnc6_HU65mTzCmXnjA-WLt7XqxqPj7EwAtlO0   \n",
       "        358  _2viQ_Qnc6-jk58CPwBnqfbM6oByJH5oPvCtKecLQyo   \n",
       "        359  _2viQ_Qnc6_fgKR1W7-k1lbVURi8hVbMlQAMSOCSnyk   \n",
       "        360  _2viQ_Qnc69MEEHHJxZ427KX8MlljJPnUC2YBbvbWwY   \n",
       "        361  _2viQ_Qnc6_RKHVetk9kLzx8ZC62_J7y73FWFSBTe8Q   \n",
       "        362  _2viQ_Qnc68TufyXKiTwky80ewSPbhRiD5XFHrJH9lg   \n",
       "        363  _2viQ_Qnc689m-WiwOwvrQU7LvkLAgspnfXL8ovE0ME   \n",
       "        364  _2viQ_Qnc6_1Hq9MGlefkBIszt9rYD3S_CozADvMhQ4   \n",
       "        365  _2viQ_Qnc6-bMSjqyL1NKj57ROicCSJV5SwTrw-RFFA   \n",
       "        366  _2viQ_Qnc6-pY-1yR6K2FhmC5i48-WuNx5CumlHLDAI   \n",
       "        367  _2viQ_Qnc6_k_n_Bse9zVhJP8tJReZpo8uM2uZfnzDs   \n",
       "        368  _2viQ_Qnc6_yBt8UGMWyg3vh0PulTqcqyQtdE7d4Fl0   \n",
       "        369  _2viQ_Qnc685RPw1aSa1tfrIuHXRvAQ2rPT9R06KTqA   \n",
       "\n",
       "                           AUTHOR                        DATE  \\\n",
       "Psy     0               Julius NM         2013-11-07T06:20:48   \n",
       "        1             adam riyati         2013-11-07T12:37:15   \n",
       "        2        Evgeny Murashkin         2013-11-08T17:34:21   \n",
       "        3         ElNino Melendez         2013-11-09T08:28:43   \n",
       "        4                  GsMega         2013-11-10T16:05:38   \n",
       "        5            Jason Haddad         2013-11-26T02:55:11   \n",
       "        6          ferleck ferles         2013-11-27T21:39:24   \n",
       "        7            Bob Kanowski         2013-11-28T12:33:27   \n",
       "        8                    Cony         2013-11-28T16:01:47   \n",
       "        9             BeBe Burkey         2013-11-28T16:30:13   \n",
       "        10              Huckyduck         2013-11-28T17:06:17   \n",
       "        11            Lone Twistt         2013-11-28T17:34:55   \n",
       "        12           Archie Lewis         2013-11-28T17:54:39   \n",
       "        13        TheUploadaddict         2013-11-28T18:12:12   \n",
       "        14         Francisco Nora         2013-11-28T19:52:35   \n",
       "        15   Gaming and Stuff PRO         2013-11-28T21:14:13   \n",
       "        16            Zielimeek21         2013-11-28T21:49:00   \n",
       "        17         OutrightIgnite         2013-11-28T21:55:02   \n",
       "        18         Tony K Frazier         2013-11-28T23:57:13   \n",
       "        19          Jose Renteria         2013-11-29T00:22:01   \n",
       "        20           zhichao wang         2013-11-29T02:13:56   \n",
       "        21        Carlos Thegamer         2013-12-01T01:20:21   \n",
       "        22         OutrightIgnite         2013-12-01T03:30:55   \n",
       "        23               Owen Lai         2013-12-01T04:51:52   \n",
       "        24            ||GuitarZ||         2013-12-23T12:54:38   \n",
       "        25          Living4Techno         2013-12-25T19:46:26   \n",
       "        26             8-BitMusic         2013-12-27T23:07:50   \n",
       "        27             Kyle Jaber         2014-01-19T00:21:29   \n",
       "        28          Brandon Pryor         2014-01-19T00:36:25   \n",
       "        29          Fun&amp;Hacks         2014-01-19T00:42:35   \n",
       "...                           ...                         ...   \n",
       "Shakira 340         damion taylor  2013-07-21T20:20:34.118000   \n",
       "        341       Shadrach Grentz  2013-07-21T12:21:37.898000   \n",
       "        342          Joshua Kasey  2013-07-21T08:26:18.155000   \n",
       "        343          ricky swaggz  2013-07-20T22:09:23.728000   \n",
       "        344           steven reed  2013-07-19T22:12:16.609000   \n",
       "        345       Johnny Rei Vlog  2013-07-19T11:41:54.923000   \n",
       "        346  yakikukamo FIRELOVER  2013-07-18T17:07:06.152000   \n",
       "        347       Johnny Rei Vlog  2013-07-18T16:50:26.909000   \n",
       "        348           hsn moghrbi  2013-07-17T21:14:40.168000   \n",
       "        349     Zuzanna Sztandera  2013-07-17T20:41:00.612000   \n",
       "        350       Chelsea Fischer  2013-07-17T20:34:59.389000   \n",
       "        351         Lisa Matthews  2013-07-17T13:56:03.233000   \n",
       "        352         laura elliott  2013-07-16T05:48:01.795000   \n",
       "        353         Riley Rollins  2013-07-16T00:30:46.660000   \n",
       "        354         Oona Sarlotta  2013-07-15T16:08:50.204000   \n",
       "        355            lynchee360  2013-07-14T22:38:26.779000   \n",
       "        356            FAHAD KHAN  2013-07-14T22:06:57.712000   \n",
       "        357          ricky swaggz  2013-07-14T20:40:00.331000   \n",
       "        358       Shadrach Grentz  2013-07-14T03:11:20.243000   \n",
       "        359           ThirdDegr3e  2013-07-13T20:48:22.967000   \n",
       "        360           ThirdDegr3e  2013-07-13T20:48:06.033000   \n",
       "        361           ThirdDegr3e  2013-07-13T20:47:40.793000   \n",
       "        362      Ysobel Schofield  2013-07-13T20:17:25.181000   \n",
       "        363       TheHotChocolate  2013-07-13T18:26:37.017000   \n",
       "        364         Dinova Sharon  2013-07-13T14:44:00.700000   \n",
       "        365          Katie Mettam  2013-07-13T13:27:39.441000   \n",
       "        366  Sabina Pearson-Smith  2013-07-13T13:14:30.021000   \n",
       "        367         jeffrey jules  2013-07-13T12:09:31.188000   \n",
       "        368        Aishlin Maciel  2013-07-13T11:17:52.308000   \n",
       "        369           Latin Bosch  2013-07-12T22:33:27.916000   \n",
       "\n",
       "                                                       CONTENT  CLASS  \n",
       "Psy     0    Huh, anyway check out this you[tube] channel: ...      1  \n",
       "        1    Hey guys check out my new channel and our firs...      1  \n",
       "        2               just for test I have to say murdev.com      1  \n",
       "        3     me shaking my sexy ass on my channel enjoy ^_^ ﻿      1  \n",
       "        4              watch?v=vtaRGgvGtWQ   Check this out .﻿      1  \n",
       "        5    Hey, check out my new website!! This site is a...      1  \n",
       "        6                            Subscribe to my channel ﻿      1  \n",
       "        7    i turned it on mute as soon is i came on i jus...      0  \n",
       "        8      You should check my channel for Funny VIDEOS!!﻿      1  \n",
       "        9    and u should.d check my channel and tell me wh...      1  \n",
       "        10                                Hey subscribe to me﻿      1  \n",
       "        11    Once you have started reading do not stop. If...      1  \n",
       "        12                https://twitter.com/GBphotographyGB﻿      1  \n",
       "        13                             subscribe like comment﻿      1  \n",
       "        14   please like :D https://premium.easypromosapp.c...      1  \n",
       "        15   Hello! Do you like gaming, art videos, scienti...      1  \n",
       "        16                        I'm only checking the views﻿      0  \n",
       "        17   http://www.ebay.com/itm/171183229277?ssPageNam...      1  \n",
       "        18        http://ubuntuone.com/40beUutVu2ZKxK4uTgPZ8K﻿      1  \n",
       "        19   We are an EDM apparel company dedicated to bri...      1  \n",
       "        20   i think about 100 millions of the views come f...      0  \n",
       "        21                  subscribe to my channel people :D﻿      1  \n",
       "        22   Show your AUBURN PRIDE HERE: http://www.teespr...      1  \n",
       "        23                            just checking the views﻿      0  \n",
       "        24                                CHECK OUT MY CHANNEL      1  \n",
       "        25   marketglory . com/strategygame/andrijamatf ear...      1  \n",
       "        26   Hey guys! Im a 12 yr old music producer. I mak...      1  \n",
       "        27             Check me out! I'm kyle. I rap so yeah ﻿      1  \n",
       "        28   I dont even watch it anymore i just come here ...      0  \n",
       "        29    Subscribe to me for free Android games, apps.. ﻿      1  \n",
       "...                                                        ...    ...  \n",
       "Shakira 340                             check out my new video      1  \n",
       "        341  Hey Music Fans I really appreciate all of you ...      1  \n",
       "        342  Hello everyone, It Is not my intention to spam...      1  \n",
       "        343  ******* Facebook is LAME and so 2004! Check ou...      1  \n",
       "        344  Please check out and send to others Freedom an...      1  \n",
       "        345  Nice to meet You - this is Johnny: 1. If You a...      1  \n",
       "        346   hey you ! check out the channel of Alvar Lake !!      1  \n",
       "        347  Hi -this is Johnny: 1. If You already know my ...      1  \n",
       "        348                                                wow      0  \n",
       "        349                                  Love this song!!!      0  \n",
       "        350                              Love this song !!!!!!      0  \n",
       "        351  Check out this video on YouTube:<br />&quot;Th...      1  \n",
       "        352  i watched this because of the large amount of ...      0  \n",
       "        353  O peoples of the earth, I have seen how you pe...      1  \n",
       "        354               this song always gives me chills! :)      0  \n",
       "        355                                I love dis song!! 3      0  \n",
       "        356  I WILL NEVER FORGET THIS SONG IN MY LIFE LIKE ...      1  \n",
       "        357  ********OMG Facebook is OLD! Check out  ------...      1  \n",
       "        358  Hey Music Fans I really appreciate all of you ...      1  \n",
       "        359  **CHECK OUT MY NEW MIXTAPE**** **CHECK OUT MY ...      1  \n",
       "        360  **CHECK OUT MY NEW MIXTAPE**** **CHECK OUT MY ...      1  \n",
       "        361  **CHECK OUT MY NEW MIXTAPE**** **CHECK OUT MY ...      1  \n",
       "        362                                Waka waka she rules      0  \n",
       "        363                           she is sooooo beautiful!      0  \n",
       "        364                                  well done shakira      0  \n",
       "        365  I love this song because we sing it at Camp al...      0  \n",
       "        366  I love this song for two reasons: 1.it is abou...      0  \n",
       "        367                                                wow      0  \n",
       "        368                            Shakira u are so wiredo      0  \n",
       "        369                         Shakira is the best dancer      0  \n",
       "\n",
       "[1956 rows x 5 columns]"
      ]
     },
     "execution_count": 14,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "df_with_keys"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "# Checking for Only Comments on Shakira\n",
    "df_with_keys.loc['Shakira']"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 15,
   "metadata": {},
   "outputs": [],
   "source": [
    "# Save and Write Merged Data to csv\n",
    "df_with_keys.to_csv(\"YoutubeSpamMergedData.csv\")"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 16,
   "metadata": {},
   "outputs": [],
   "source": [
    "df = df_with_keys"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 17,
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/plain": [
       "9780"
      ]
     },
     "execution_count": 17,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "df.size"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "#### Data Cleaning"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 18,
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/plain": [
       "Index(['COMMENT_ID', 'AUTHOR', 'DATE', 'CONTENT', 'CLASS'], dtype='object')"
      ]
     },
     "execution_count": 18,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "# Checking for Consistent Column Name\n",
    "df.columns"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 19,
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/plain": [
       "COMMENT_ID    object\n",
       "AUTHOR        object\n",
       "DATE          object\n",
       "CONTENT       object\n",
       "CLASS          int64\n",
       "dtype: object"
      ]
     },
     "execution_count": 19,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "# Checking for Datatypes\n",
    "df.dtypes"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 20,
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/plain": [
       "COMMENT_ID    0\n",
       "AUTHOR        0\n",
       "DATE          0\n",
       "CONTENT       0\n",
       "CLASS         0\n",
       "dtype: int64"
      ]
     },
     "execution_count": 20,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "# Check for missing nan\n",
    "df.isnull().isnull().sum()"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 21,
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/plain": [
       "Psy      0             2013-11-07T06:20:48\n",
       "         1             2013-11-07T12:37:15\n",
       "         2             2013-11-08T17:34:21\n",
       "         3             2013-11-09T08:28:43\n",
       "         4             2013-11-10T16:05:38\n",
       "         5             2013-11-26T02:55:11\n",
       "         6             2013-11-27T21:39:24\n",
       "         7             2013-11-28T12:33:27\n",
       "         8             2013-11-28T16:01:47\n",
       "         9             2013-11-28T16:30:13\n",
       "         10            2013-11-28T17:06:17\n",
       "         11            2013-11-28T17:34:55\n",
       "         12            2013-11-28T17:54:39\n",
       "         13            2013-11-28T18:12:12\n",
       "         14            2013-11-28T19:52:35\n",
       "         15            2013-11-28T21:14:13\n",
       "         16            2013-11-28T21:49:00\n",
       "         17            2013-11-28T21:55:02\n",
       "         18            2013-11-28T23:57:13\n",
       "         19            2013-11-29T00:22:01\n",
       "         20            2013-11-29T02:13:56\n",
       "         21            2013-12-01T01:20:21\n",
       "         22            2013-12-01T03:30:55\n",
       "         23            2013-12-01T04:51:52\n",
       "         24            2013-12-23T12:54:38\n",
       "         25            2013-12-25T19:46:26\n",
       "         26            2013-12-27T23:07:50\n",
       "         27            2014-01-19T00:21:29\n",
       "         28            2014-01-19T00:36:25\n",
       "         29            2014-01-19T00:42:35\n",
       "                           ...            \n",
       "Shakira  340    2013-07-21T20:20:34.118000\n",
       "         341    2013-07-21T12:21:37.898000\n",
       "         342    2013-07-21T08:26:18.155000\n",
       "         343    2013-07-20T22:09:23.728000\n",
       "         344    2013-07-19T22:12:16.609000\n",
       "         345    2013-07-19T11:41:54.923000\n",
       "         346    2013-07-18T17:07:06.152000\n",
       "         347    2013-07-18T16:50:26.909000\n",
       "         348    2013-07-17T21:14:40.168000\n",
       "         349    2013-07-17T20:41:00.612000\n",
       "         350    2013-07-17T20:34:59.389000\n",
       "         351    2013-07-17T13:56:03.233000\n",
       "         352    2013-07-16T05:48:01.795000\n",
       "         353    2013-07-16T00:30:46.660000\n",
       "         354    2013-07-15T16:08:50.204000\n",
       "         355    2013-07-14T22:38:26.779000\n",
       "         356    2013-07-14T22:06:57.712000\n",
       "         357    2013-07-14T20:40:00.331000\n",
       "         358    2013-07-14T03:11:20.243000\n",
       "         359    2013-07-13T20:48:22.967000\n",
       "         360    2013-07-13T20:48:06.033000\n",
       "         361    2013-07-13T20:47:40.793000\n",
       "         362    2013-07-13T20:17:25.181000\n",
       "         363    2013-07-13T18:26:37.017000\n",
       "         364    2013-07-13T14:44:00.700000\n",
       "         365    2013-07-13T13:27:39.441000\n",
       "         366    2013-07-13T13:14:30.021000\n",
       "         367    2013-07-13T12:09:31.188000\n",
       "         368    2013-07-13T11:17:52.308000\n",
       "         369    2013-07-12T22:33:27.916000\n",
       "Name: DATE, Length: 1956, dtype: object"
      ]
     },
     "execution_count": 21,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "# Checking for Date\n",
    "df[\"DATE\"]"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "df.AUTHOR\n",
    "# Convert the Author Name to First Name and Last Name\n",
    "#df[[\"FIRSTNAME\",\"LASTNAME\"]] = df['AUTHOR'].str.split(expand=True)"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "#### Working With Text Content"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 22,
   "metadata": {},
   "outputs": [],
   "source": [
    "df_data = df[[\"CONTENT\",\"CLASS\"]]"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 23,
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/plain": [
       "Index(['CONTENT', 'CLASS'], dtype='object')"
      ]
     },
     "execution_count": 23,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "df_data.columns"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 24,
   "metadata": {},
   "outputs": [],
   "source": [
    "df_x = df_data['CONTENT']\n",
    "df_y = df_data['CLASS']"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "#### Feature Extraction From Text\n",
    "+ CountVectorizer\n",
    "+ TfidfVectorizer"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 25,
   "metadata": {},
   "outputs": [],
   "source": [
    "cv = CountVectorizer()\n",
    "ex = cv.fit_transform([\"Great song but check this out\",\"What is this song?\"])"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 26,
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/plain": [
       "array([[1, 1, 1, 0, 1, 1, 1, 0],\n",
       "       [0, 0, 0, 1, 0, 1, 1, 1]], dtype=int64)"
      ]
     },
     "execution_count": 26,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "# Example of What CountVectorizer Does\n",
    "ex.toarray()"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 27,
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/plain": [
       "['but', 'check', 'great', 'is', 'out', 'song', 'this', 'what']"
      ]
     },
     "execution_count": 27,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "cv.get_feature_names()"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 28,
   "metadata": {},
   "outputs": [],
   "source": [
    "# Extract Feature With CountVectorizer\n",
    "corpus = df_x\n",
    "cv = CountVectorizer()\n",
    "X = cv.fit_transform(corpus) # Fit the Data"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 29,
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/plain": [
       "array([[0, 0, 0, ..., 0, 0, 0],\n",
       "       [0, 0, 0, ..., 0, 0, 0],\n",
       "       [0, 0, 0, ..., 0, 0, 0],\n",
       "       ...,\n",
       "       [0, 0, 0, ..., 0, 0, 0],\n",
       "       [0, 0, 0, ..., 0, 0, 0],\n",
       "       [0, 0, 0, ..., 0, 0, 0]], dtype=int64)"
      ]
     },
     "execution_count": 29,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "X.toarray()"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 30,
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/plain": [
       "['00',\n",
       " '000',\n",
       " '002',\n",
       " '018',\n",
       " '02',\n",
       " '034',\n",
       " '04',\n",
       " '047000',\n",
       " '05',\n",
       " '053012',\n",
       " '0687119038',\n",
       " '08',\n",
       " '09',\n",
       " '0cb8qfjaa',\n",
       " '0d878a889c',\n",
       " '0dbhjzdw0lbsjbi40gxm0d0p5krhv8xinqli53__wqbahs8zx4mjhw5vwrkpxfoeks',\n",
       " '0laviqu2b',\n",
       " '10',\n",
       " '100',\n",
       " '1000',\n",
       " '10000000',\n",
       " '1000000000',\n",
       " '100000415527985',\n",
       " '100005244783212',\n",
       " '100007085325116',\n",
       " '10001',\n",
       " '100877300245414',\n",
       " '101721377578919894134',\n",
       " '10200253113705769',\n",
       " '1030',\n",
       " '104999962146104962510',\n",
       " '10626048',\n",
       " '10626835',\n",
       " '106865403',\n",
       " '107297364',\n",
       " '1073741825',\n",
       " '1073741828',\n",
       " '1073741830',\n",
       " '1073741943',\n",
       " '108k',\n",
       " '109',\n",
       " '10b35481',\n",
       " '11',\n",
       " '1111',\n",
       " '1111111111111111111',\n",
       " '111719098841907',\n",
       " '111982027348137311818',\n",
       " '112720997191206369631',\n",
       " '11cpwb',\n",
       " '11th',\n",
       " '12',\n",
       " '123',\n",
       " '124',\n",
       " '124923004',\n",
       " '126',\n",
       " '127',\n",
       " '128gb',\n",
       " '12year',\n",
       " '13',\n",
       " '13017194',\n",
       " '131275322914',\n",
       " '131338190916',\n",
       " '1337',\n",
       " '1340488',\n",
       " '1340489',\n",
       " '1340490',\n",
       " '1340491',\n",
       " '1340492',\n",
       " '1340493',\n",
       " '1340494',\n",
       " '1340499',\n",
       " '1340500',\n",
       " '1340502',\n",
       " '1340503',\n",
       " '1340504',\n",
       " '1340517',\n",
       " '1340518',\n",
       " '1340519',\n",
       " '1340520',\n",
       " '1340521',\n",
       " '1340522',\n",
       " '1340523',\n",
       " '1340524',\n",
       " '134470083389909',\n",
       " '14',\n",
       " '1408122684',\n",
       " '1415297812',\n",
       " '1442646731',\n",
       " '1446084',\n",
       " '1461302180794905',\n",
       " '1495323920744243',\n",
       " '1496241863981208',\n",
       " '1496273723978022',\n",
       " '1498561870415874',\n",
       " '14gkvdo',\n",
       " '15',\n",
       " '16',\n",
       " '161620527267482',\n",
       " '16gb',\n",
       " '17',\n",
       " '171183229277',\n",
       " '1727483389',\n",
       " '17yr',\n",
       " '18',\n",
       " '19',\n",
       " '19255',\n",
       " '1990',\n",
       " '19924',\n",
       " '1b',\n",
       " '1bi',\n",
       " '1billiom',\n",
       " '1billion',\n",
       " '1bsefqe',\n",
       " '1fhenqx1twqm153v2ptayiejnealahzvem',\n",
       " '1firo',\n",
       " '1hmvtx',\n",
       " '1k',\n",
       " '1m',\n",
       " '1m00s',\n",
       " '1min',\n",
       " '20',\n",
       " '200',\n",
       " '2004',\n",
       " '2005',\n",
       " '2008',\n",
       " '2009',\n",
       " '200k',\n",
       " '200mm',\n",
       " '2010',\n",
       " '2011',\n",
       " '2012',\n",
       " '2012430',\n",
       " '2012bitches',\n",
       " '2013',\n",
       " '2014',\n",
       " '201470069872822',\n",
       " '2015',\n",
       " '2016',\n",
       " '2017',\n",
       " '207230212795137',\n",
       " '21',\n",
       " '210',\n",
       " '2177367',\n",
       " '229508',\n",
       " '23',\n",
       " '23active',\n",
       " '23awesome',\n",
       " '23eminem',\n",
       " '23everydayimvaping',\n",
       " '23giraffebruuh',\n",
       " '23king',\n",
       " '23kinglothedancer',\n",
       " '23lmfao',\n",
       " '23lovethewayyoulie',\n",
       " '23rapgod',\n",
       " '23rt',\n",
       " '23share',\n",
       " '24',\n",
       " '24398',\n",
       " '243a',\n",
       " '247',\n",
       " '25',\n",
       " '250',\n",
       " '25000',\n",
       " '251638183951',\n",
       " '25874',\n",
       " '25th',\n",
       " '26',\n",
       " '26032883',\n",
       " '26t22',\n",
       " '27',\n",
       " '279',\n",
       " '28',\n",
       " '29',\n",
       " '2asfn9shghk',\n",
       " '2b',\n",
       " '2b4wywphi8c',\n",
       " '2billion',\n",
       " '2f',\n",
       " '2fen',\n",
       " '2flist_of_most_viewed_youtube_videos',\n",
       " '2fwiki',\n",
       " '2m19s',\n",
       " '2nd',\n",
       " '2parale',\n",
       " '2tggp3pv6l',\n",
       " '2x10',\n",
       " '2zme8f',\n",
       " '30',\n",
       " '300',\n",
       " '3000',\n",
       " '301',\n",
       " '302703146601369',\n",
       " '30th',\n",
       " '313327',\n",
       " '313454548839369',\n",
       " '315',\n",
       " '31st',\n",
       " '320',\n",
       " '322',\n",
       " '327568907427561',\n",
       " '32gb',\n",
       " '33',\n",
       " '333',\n",
       " '333607726823679',\n",
       " '333608120156973',\n",
       " '33gxrf',\n",
       " '342',\n",
       " '35',\n",
       " '360',\n",
       " '365',\n",
       " '36loseweight',\n",
       " '385',\n",
       " '387',\n",
       " '3873',\n",
       " '389088',\n",
       " '39',\n",
       " '390',\n",
       " '390875584405933',\n",
       " '391725794320912',\n",
       " '3a',\n",
       " '3bie',\n",
       " '3bkeywords',\n",
       " '3bqid',\n",
       " '3bsr',\n",
       " '3d',\n",
       " '3m',\n",
       " '3m40s',\n",
       " '3m57s',\n",
       " '3rd',\n",
       " '40',\n",
       " '4000',\n",
       " '4000dollars',\n",
       " '40beuutvu2zkxk4utgpz8k',\n",
       " '41',\n",
       " '421',\n",
       " '43',\n",
       " '433',\n",
       " '4344749',\n",
       " '4436607',\n",
       " '4477063',\n",
       " '447935454150',\n",
       " '4483179854075',\n",
       " '448800865296855',\n",
       " '45',\n",
       " '4500',\n",
       " '46',\n",
       " '4604617',\n",
       " '476000',\n",
       " '48051',\n",
       " '482',\n",
       " '484',\n",
       " '490',\n",
       " '492',\n",
       " '4e',\n",
       " '4g',\n",
       " '4gb',\n",
       " '4m11s',\n",
       " '4netjobs',\n",
       " '4s',\n",
       " '4shared',\n",
       " '4snjqp',\n",
       " '4th',\n",
       " '50',\n",
       " '500',\n",
       " '5000',\n",
       " '500k',\n",
       " '500m',\n",
       " '505b0232',\n",
       " '5094',\n",
       " '50k',\n",
       " '510',\n",
       " '515',\n",
       " '521',\n",
       " '5242575',\n",
       " '5277478',\n",
       " '5287',\n",
       " '53331',\n",
       " '5337555197',\n",
       " '53481',\n",
       " '543627485763966',\n",
       " '55',\n",
       " '550',\n",
       " '5575096797',\n",
       " '55mm',\n",
       " '566',\n",
       " '57',\n",
       " '58',\n",
       " '5800',\n",
       " '5af506e1',\n",
       " '5bgkg2iwphzohwaeuesrwnegqg_labco7rw9wfx8hao',\n",
       " '5c',\n",
       " '5c2f',\n",
       " '5ggs_m_9ma3ti40fs6mvpics',\n",
       " '5million',\n",
       " '5s',\n",
       " '5th',\n",
       " '5tu9gn1l310',\n",
       " '60',\n",
       " '600',\n",
       " '600m',\n",
       " '613000',\n",
       " '616375350',\n",
       " '6174122',\n",
       " '629',\n",
       " '629410220489046',\n",
       " '633807',\n",
       " '636',\n",
       " '6381501',\n",
       " '6401116',\n",
       " '661',\n",
       " '666',\n",
       " '674732645945877',\n",
       " '682',\n",
       " '694',\n",
       " '6_h0m5sayho',\n",
       " '6th',\n",
       " '700',\n",
       " '704682339621282',\n",
       " '710',\n",
       " '710000',\n",
       " '73231344',\n",
       " '733634264',\n",
       " '733949243353321',\n",
       " '734237113324534',\n",
       " '74',\n",
       " '750',\n",
       " '753',\n",
       " '754989901225153',\n",
       " '764484966942313',\n",
       " '775510675841486',\n",
       " '783',\n",
       " '79',\n",
       " '7in',\n",
       " '7k',\n",
       " '800',\n",
       " '82',\n",
       " '821',\n",
       " '824',\n",
       " '8252267209931889',\n",
       " '832000',\n",
       " '84',\n",
       " '85',\n",
       " '851',\n",
       " '857',\n",
       " '860',\n",
       " '868',\n",
       " '8692160',\n",
       " '87',\n",
       " '870',\n",
       " '88',\n",
       " '884',\n",
       " '8851222',\n",
       " '898',\n",
       " '89___',\n",
       " '89c',\n",
       " '89iyec7nrwp5nytno5u7amhvmflutggl',\n",
       " '8a',\n",
       " '8bit',\n",
       " '90',\n",
       " '90000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000',\n",
       " '902099',\n",
       " '9082175',\n",
       " '9107',\n",
       " '911',\n",
       " '920',\n",
       " '9277547',\n",
       " '936868579660284',\n",
       " '937732262907249',\n",
       " '940',\n",
       " '950',\n",
       " '969',\n",
       " '999999999',\n",
       " '9bzkp7q19f0',\n",
       " '9gag',\n",
       " '9nl',\n",
       " '_0f9fa8aa',\n",
       " '__',\n",
       " '______________________',\n",
       " '______________________________',\n",
       " '__killuminati94',\n",
       " '_bzszz',\n",
       " '_chris_cz',\n",
       " '_fphgk5zllsvdqv0zuf0mb',\n",
       " '_gibu',\n",
       " '_o3h',\n",
       " '_ry6f57sprnd2xv',\n",
       " '_self',\n",
       " '_thqbeum69aqup1ih',\n",
       " '_trksid',\n",
       " '_vlczzrg8vgctlpsd9ongewhj8',\n",
       " 'a0qouc7q48v3_qiaaabpugaaacsqar0_vgoqwqxjmpuyvkosf3k',\n",
       " 'a7',\n",
       " 'aa',\n",
       " 'aaaaaaa',\n",
       " 'aaas',\n",
       " 'aavpwj9',\n",
       " 'abbas',\n",
       " 'ablaze',\n",
       " 'able',\n",
       " 'abominable',\n",
       " 'abomination',\n",
       " 'abonner',\n",
       " 'about',\n",
       " 'above',\n",
       " 'absolute',\n",
       " 'absolutely',\n",
       " 'absorbing',\n",
       " 'abuse',\n",
       " 'abused',\n",
       " 'abuses',\n",
       " 'abusive',\n",
       " 'abusue',\n",
       " 'ac',\n",
       " 'acaer',\n",
       " 'acceptance',\n",
       " 'access',\n",
       " 'accessories',\n",
       " 'accidental',\n",
       " 'accomplished',\n",
       " 'account',\n",
       " 'accounts',\n",
       " 'achieve',\n",
       " 'achieved',\n",
       " 'acidic',\n",
       " 'aclk',\n",
       " 'acn2g',\n",
       " 'acoustic',\n",
       " 'acquire',\n",
       " 'acquiring',\n",
       " 'across',\n",
       " 'act',\n",
       " 'acting',\n",
       " 'activates',\n",
       " 'active',\n",
       " 'actor',\n",
       " 'actorid',\n",
       " 'actors',\n",
       " 'actresses',\n",
       " 'actual',\n",
       " 'actually',\n",
       " 'ad',\n",
       " 'adam',\n",
       " 'adapt',\n",
       " 'add',\n",
       " 'addicting',\n",
       " 'adding',\n",
       " 'addition',\n",
       " 'adele',\n",
       " 'adf',\n",
       " 'adhoc',\n",
       " 'admirable',\n",
       " 'admire',\n",
       " 'admit',\n",
       " 'admitting',\n",
       " 'adore',\n",
       " 'adoult',\n",
       " 'adroid',\n",
       " 'adsense',\n",
       " 'adult',\n",
       " 'adurl',\n",
       " 'advance',\n",
       " 'advertise',\n",
       " 'advertisements',\n",
       " 'advertisiments',\n",
       " 'advertising',\n",
       " 'advice',\n",
       " 'affiliated',\n",
       " 'affiliateid',\n",
       " 'afflicted',\n",
       " 'afford',\n",
       " 'afiliati',\n",
       " 'afiliere',\n",
       " 'afqjcngkm',\n",
       " 'afraid',\n",
       " 'africa',\n",
       " 'african',\n",
       " 'africans',\n",
       " 'after',\n",
       " 'aftermath',\n",
       " 'again',\n",
       " 'against',\n",
       " 'age',\n",
       " 'ago',\n",
       " 'agree',\n",
       " 'agreeable',\n",
       " 'ah',\n",
       " 'ahead',\n",
       " 'ahhh',\n",
       " 'ai',\n",
       " 'aid',\n",
       " 'aiiima',\n",
       " 'aimbwbfqbzg',\n",
       " 'ain',\n",
       " 'air',\n",
       " 'airlines',\n",
       " 'airplane',\n",
       " 'aka',\n",
       " 'al',\n",
       " 'album',\n",
       " 'alcoholic',\n",
       " 'alert',\n",
       " 'alex',\n",
       " 'alfred',\n",
       " 'ali',\n",
       " 'alive',\n",
       " 'all',\n",
       " 'allot',\n",
       " 'allow',\n",
       " 'allows',\n",
       " 'allways',\n",
       " 'almond',\n",
       " 'almost',\n",
       " 'alo',\n",
       " 'aloidia',\n",
       " 'alone',\n",
       " 'alot',\n",
       " 'already',\n",
       " 'alright',\n",
       " 'also',\n",
       " 'alternate',\n",
       " 'alvar',\n",
       " 'always',\n",
       " 'am',\n",
       " 'amazed',\n",
       " 'amazement',\n",
       " 'amazing',\n",
       " 'amazon',\n",
       " 'ambition',\n",
       " 'ambitious',\n",
       " 'amendment',\n",
       " 'america',\n",
       " 'american',\n",
       " 'americans',\n",
       " 'amiable',\n",
       " 'amount',\n",
       " 'amp',\n",
       " 'amy',\n",
       " 'an',\n",
       " 'ana',\n",
       " 'anaconda',\n",
       " 'analyst',\n",
       " 'anand',\n",
       " 'ancestors',\n",
       " 'and',\n",
       " 'anderson',\n",
       " 'andrew',\n",
       " 'andrijamatf',\n",
       " 'android',\n",
       " 'angel',\n",
       " 'angels',\n",
       " 'angry',\n",
       " 'animal',\n",
       " 'animals',\n",
       " 'animation',\n",
       " 'animations',\n",
       " 'animator',\n",
       " 'anime',\n",
       " 'animes',\n",
       " 'annoyed',\n",
       " 'annoying',\n",
       " 'annoys',\n",
       " 'another',\n",
       " 'ans',\n",
       " 'answer',\n",
       " 'anthem',\n",
       " 'antrobofficial',\n",
       " 'anxious',\n",
       " 'any',\n",
       " 'anybody',\n",
       " 'anymore',\n",
       " 'anyone',\n",
       " 'anything',\n",
       " 'anyway',\n",
       " 'anywhere',\n",
       " 'anywon',\n",
       " 'aod64_1ofc7seh_1pop',\n",
       " 'aplica',\n",
       " 'apocalypse',\n",
       " 'apologies',\n",
       " 'apostles',\n",
       " 'app',\n",
       " 'apparel',\n",
       " 'apparently',\n",
       " 'applause',\n",
       " 'apple',\n",
       " 'applied',\n",
       " 'applocker',\n",
       " 'appoints',\n",
       " 'appreciate',\n",
       " 'appreciated',\n",
       " 'apprecitate',\n",
       " 'approve',\n",
       " 'apps',\n",
       " 'arbitrate',\n",
       " 'are',\n",
       " 'aren',\n",
       " 'arguements',\n",
       " 'arive',\n",
       " 'arkglzjqup0',\n",
       " 'arm',\n",
       " 'army',\n",
       " 'around',\n",
       " 'arrogant',\n",
       " 'arrowgance',\n",
       " 'art',\n",
       " 'artady',\n",
       " 'articles',\n",
       " 'artist',\n",
       " 'artists',\n",
       " 'as',\n",
       " 'aseris',\n",
       " 'asia',\n",
       " 'asian',\n",
       " 'asinine',\n",
       " 'ask',\n",
       " 'asked',\n",
       " 'asking',\n",
       " 'aslamu',\n",
       " 'aspiring',\n",
       " 'aspx',\n",
       " 'ass',\n",
       " 'assume',\n",
       " 'astauand',\n",
       " 'aswell',\n",
       " 'at',\n",
       " 'atlastatlas',\n",
       " 'attacks',\n",
       " 'attention',\n",
       " 'auburn',\n",
       " 'audio',\n",
       " 'audiojungle',\n",
       " 'audit',\n",
       " 'audition',\n",
       " 'auditiondetail_',\n",
       " 'auditioning',\n",
       " 'auditions',\n",
       " 'aunt',\n",
       " 'austin',\n",
       " 'australia',\n",
       " 'authenticviews',\n",
       " 'authority',\n",
       " 'auto',\n",
       " 'autotune',\n",
       " 'autotuned',\n",
       " 'avaaz',\n",
       " 'available',\n",
       " 'avicii',\n",
       " 'avoid',\n",
       " 'aw',\n",
       " 'awards',\n",
       " 'aware',\n",
       " 'away',\n",
       " 'aways',\n",
       " 'awesom',\n",
       " 'awesome',\n",
       " 'awesomeness',\n",
       " 'awesoooome',\n",
       " 'awesum',\n",
       " 'awful',\n",
       " 'awsome',\n",
       " 'axeljonssons',\n",
       " 'axiomatic',\n",
       " 'axy665',\n",
       " 'aye',\n",
       " 'ayyy',\n",
       " 'azerbaijan',\n",
       " 'b00ecvf93g',\n",
       " 'b00mppqhri',\n",
       " 'b3',\n",
       " 'b5',\n",
       " 'b5t',\n",
       " 'b7b',\n",
       " 'b8l',\n",
       " 'ba',\n",
       " 'baba',\n",
       " 'babe',\n",
       " 'baby',\n",
       " 'back',\n",
       " 'bad',\n",
       " 'bady',\n",
       " 'ball',\n",
       " 'ballad',\n",
       " 'balls',\n",
       " 'band',\n",
       " 'bands',\n",
       " 'bang',\n",
       " 'bangers',\n",
       " 'banging',\n",
       " 'bangladesh',\n",
       " 'barnesandnoble',\n",
       " 'bars',\n",
       " 'base',\n",
       " 'based',\n",
       " 'basically',\n",
       " 'basketball',\n",
       " 'bass',\n",
       " 'bastard',\n",
       " 'bd3721315',\n",
       " 'bdp',\n",
       " 'be',\n",
       " 'beast',\n",
       " 'beat',\n",
       " 'beatboxing',\n",
       " 'beaties',\n",
       " 'beating',\n",
       " 'beats',\n",
       " 'beautiful',\n",
       " 'beauty',\n",
       " 'because',\n",
       " 'become',\n",
       " 'becomes',\n",
       " 'been',\n",
       " 'before',\n",
       " 'begin',\n",
       " 'beginning',\n",
       " 'behavior',\n",
       " 'behind',\n",
       " 'behold',\n",
       " 'beibs',\n",
       " 'being',\n",
       " 'belarus',\n",
       " 'belgique',\n",
       " 'belgium',\n",
       " 'believe',\n",
       " 'believemefilm',\n",
       " 'believer',\n",
       " 'believing',\n",
       " 'bella',\n",
       " 'belle',\n",
       " 'belly',\n",
       " 'below',\n",
       " 'belrus',\n",
       " 'beneath',\n",
       " 'bengal',\n",
       " 'bennett',\n",
       " 'berzerk',\n",
       " 'besloor',\n",
       " 'best',\n",
       " 'bet',\n",
       " 'beta',\n",
       " 'betfair',\n",
       " 'better',\n",
       " 'between',\n",
       " 'beutiful',\n",
       " 'beware',\n",
       " 'bf4',\n",
       " 'bg',\n",
       " 'bgq',\n",
       " 'bieber',\n",
       " 'big',\n",
       " 'bigboss286',\n",
       " 'bigelow',\n",
       " 'bigger',\n",
       " 'bighit',\n",
       " 'bikini',\n",
       " 'bil',\n",
       " 'bilion',\n",
       " 'billboard',\n",
       " 'billie',\n",
       " 'billion',\n",
       " 'billions',\n",
       " 'billon',\n",
       " 'bills',\n",
       " 'binbox',\n",
       " 'bing',\n",
       " 'birtgday',\n",
       " 'birthday',\n",
       " 'bisexual',\n",
       " 'bishopsgravemarker',\n",
       " 'bit',\n",
       " 'bitch',\n",
       " 'bitcoins',\n",
       " 'bite',\n",
       " 'black',\n",
       " 'blanc',\n",
       " 'blank',\n",
       " 'blast',\n",
       " 'bleach',\n",
       " 'bless',\n",
       " 'blessing',\n",
       " 'block',\n",
       " 'blog',\n",
       " 'blogfa',\n",
       " 'blogspot',\n",
       " 'blond',\n",
       " 'blonde',\n",
       " 'blow',\n",
       " 'blows',\n",
       " 'blue',\n",
       " 'blushing',\n",
       " 'boa',\n",
       " 'boaconic',\n",
       " 'bocilile',\n",
       " 'body',\n",
       " 'bogdan',\n",
       " 'bomb',\n",
       " 'bones',\n",
       " 'bonus',\n",
       " 'boobs',\n",
       " 'book',\n",
       " 'bookies',\n",
       " 'bookmakers',\n",
       " 'boomerul',\n",
       " 'boooobs',\n",
       " 'boost',\n",
       " 'border',\n",
       " 'borderlands',\n",
       " 'bored',\n",
       " 'boring',\n",
       " 'born',\n",
       " 'bother',\n",
       " 'bots',\n",
       " 'bottom',\n",
       " 'bout',\n",
       " 'bowl',\n",
       " 'box',\n",
       " 'boxium',\n",
       " 'boy',\n",
       " 'boyfriend',\n",
       " 'boys',\n",
       " 'bps',\n",
       " 'br',\n",
       " 'brah',\n",
       " 'brain',\n",
       " 'brake',\n",
       " 'brand',\n",
       " 'brave',\n",
       " 'brazil',\n",
       " 'brazilian',\n",
       " 'break',\n",
       " 'breaken',\n",
       " 'breaks',\n",
       " 'breath',\n",
       " 'brew',\n",
       " 'briefs',\n",
       " 'bring',\n",
       " 'bringing',\n",
       " 'brings',\n",
       " 'brinkman',\n",
       " 'british',\n",
       " 'britishs',\n",
       " 'broken',\n",
       " 'brooooo',\n",
       " 'brother',\n",
       " 'brotherhood',\n",
       " 'brothers',\n",
       " 'brought',\n",
       " 'browser',\n",
       " 'brt0u5',\n",
       " 'brutally',\n",
       " 'bs',\n",
       " 'btw',\n",
       " 'bubblews',\n",
       " 'buchmair',\n",
       " 'bucket',\n",
       " 'bucks',\n",
       " 'buggti',\n",
       " 'build',\n",
       " 'bulgaria',\n",
       " 'bumps',\n",
       " 'bunch',\n",
       " 'burda',\n",
       " 'burder',\n",
       " 'burned',\n",
       " 'burst',\n",
       " 'bus',\n",
       " 'business',\n",
       " 'busyglide',\n",
       " 'but',\n",
       " 'butalabs',\n",
       " 'butt',\n",
       " 'button',\n",
       " 'butts',\n",
       " 'buy',\n",
       " 'buys',\n",
       " 'buzz',\n",
       " 'bv',\n",
       " 'bvm',\n",
       " 'bxrosr',\n",
       " 'by',\n",
       " 'c3',\n",
       " 'c349',\n",
       " 'ca',\n",
       " 'cabelo',\n",
       " 'cachebuster',\n",
       " 'cad',\n",
       " 'call',\n",
       " 'called',\n",
       " 'calls',\n",
       " 'came',\n",
       " 'camera',\n",
       " 'cameraman',\n",
       " 'camp',\n",
       " 'campid',\n",
       " 'can',\n",
       " 'canal',\n",
       " 'cancer',\n",
       " 'canibus',\n",
       " 'cant',\n",
       " 'canvas',\n",
       " 'cap',\n",
       " 'capitalized',\n",
       " 'car',\n",
       " 'card',\n",
       " 'cards',\n",
       " 'care',\n",
       " 'career',\n",
       " 'cares',\n",
       " 'caroline',\n",
       " 'case',\n",
       " 'cash',\n",
       " 'catch',\n",
       " 'catchy',\n",
       " 'categories',\n",
       " 'cats',\n",
       " 'cause',\n",
       " 'caution',\n",
       " 'cazzy',\n",
       " 'cd',\n",
       " 'cd92db3f4',\n",
       " 'ce',\n",
       " 'cease',\n",
       " 'cece',\n",
       " 'celeb',\n",
       " 'celebrate',\n",
       " 'celebrated',\n",
       " 'celebration',\n",
       " 'celebrity',\n",
       " 'censor',\n",
       " 'cent',\n",
       " 'central',\n",
       " 'cents',\n",
       " 'cereal',\n",
       " 'certain',\n",
       " 'certification',\n",
       " 'cevxzvsjlk8',\n",
       " 'cge',\n",
       " 'chacking',\n",
       " 'chainise',\n",
       " 'challenge',\n",
       " 'challenges',\n",
       " 'champion',\n",
       " 'chance',\n",
       " 'chanel',\n",
       " 'chanell',\n",
       " 'change',\n",
       " 'changeable',\n",
       " 'chanicka',\n",
       " 'channel',\n",
       " 'channels',\n",
       " 'channnnnnelll',\n",
       " 'chanson',\n",
       " 'chap',\n",
       " 'characterized',\n",
       " 'charity',\n",
       " 'charley',\n",
       " 'charlie',\n",
       " 'charlieee',\n",
       " 'chaste',\n",
       " 'chaîne',\n",
       " 'chcfcvzfzfbvzdr',\n",
       " 'cheat',\n",
       " 'cheating',\n",
       " 'cheats',\n",
       " 'check',\n",
       " 'checked',\n",
       " 'checking',\n",
       " 'cheer',\n",
       " 'cheers',\n",
       " 'cheetos',\n",
       " 'cheilith',\n",
       " 'chesture',\n",
       " 'chhanel',\n",
       " 'chick',\n",
       " 'child',\n",
       " 'children',\n",
       " 'chillpal',\n",
       " 'chills',\n",
       " 'chillstep',\n",
       " 'china',\n",
       " 'chinese',\n",
       " 'ching',\n",
       " 'chiptunes',\n",
       " 'choice',\n",
       " 'chooses',\n",
       " 'chorenn',\n",
       " 'chorus',\n",
       " 'chose',\n",
       " 'chrck',\n",
       " 'christ',\n",
       " 'christianity',\n",
       " 'christians',\n",
       " 'christmas',\n",
       " 'chubby',\n",
       " 'chubbz',\n",
       " 'chuck',\n",
       " 'cid',\n",
       " 'cirus',\n",
       " 'citizen',\n",
       " ...]"
      ]
     },
     "execution_count": 30,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "# get the feature names\n",
    "cv.get_feature_names()"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "### Model Building"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 31,
   "metadata": {},
   "outputs": [],
   "source": [
    "from sklearn.model_selection import train_test_split"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 32,
   "metadata": {},
   "outputs": [],
   "source": [
    "X_train, X_test, y_train, y_test = train_test_split(X, df_y, test_size=0.33, random_state=42)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 33,
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/plain": [
       "<1310x4454 sparse matrix of type '<class 'numpy.int64'>'\n",
       "\twith 17525 stored elements in Compressed Sparse Row format>"
      ]
     },
     "execution_count": 33,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "X_train"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 34,
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/plain": [
       "0.9195046439628483"
      ]
     },
     "execution_count": 34,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "# Naive Bayes Classifier\n",
    "from sklearn.naive_bayes import MultinomialNB\n",
    "clf = MultinomialNB()\n",
    "clf.fit(X_train,y_train)\n",
    "clf.score(X_test,y_test)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 35,
   "metadata": {},
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "Accuracy of Model 91.95046439628483 %\n"
     ]
    }
   ],
   "source": [
    "# Accuracy of our Model\n",
    "print(\"Accuracy of Model\",clf.score(X_test,y_test)*100,\"%\")"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 36,
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/plain": [
       "array([0, 0, 0, 1, 1, 0, 1, 1, 0, 1, 1, 0, 1, 0, 0, 1, 1, 0, 1, 0, 1, 0,\n",
       "       1, 1, 1, 0, 0, 0, 0, 1, 1, 1, 1, 0, 1, 1, 1, 1, 0, 0, 0, 1, 0, 1,\n",
       "       1, 1, 1, 1, 0, 1, 1, 0, 1, 1, 0, 0, 1, 0, 1, 1, 1, 1, 1, 1, 0, 0,\n",
       "       0, 1, 0, 1, 0, 1, 1, 0, 1, 0, 1, 0, 0, 0, 0, 0, 1, 0, 1, 1, 1, 1,\n",
       "       1, 1, 0, 1, 1, 0, 0, 0, 1, 0, 0, 1, 1, 1, 1, 0, 0, 0, 1, 1, 1, 1,\n",
       "       0, 1, 0, 1, 0, 0, 0, 1, 0, 1, 1, 1, 0, 0, 0, 0, 1, 1, 1, 1, 1, 1,\n",
       "       1, 0, 1, 0, 1, 1, 0, 1, 1, 1, 1, 0, 1, 0, 1, 0, 1, 1, 0, 0, 0, 1,\n",
       "       1, 1, 1, 1, 1, 0, 1, 1, 0, 1, 1, 1, 0, 1, 1, 1, 0, 0, 0, 1, 0, 1,\n",
       "       0, 1, 1, 1, 0, 1, 0, 0, 1, 1, 1, 1, 0, 1, 1, 1, 1, 0, 1, 1, 1, 0,\n",
       "       1, 0, 0, 0, 0, 1, 0, 0, 0, 0, 1, 0, 1, 1, 0, 1, 0, 1, 1, 0, 1, 0,\n",
       "       1, 1, 0, 1, 1, 0, 0, 1, 1, 0, 0, 1, 0, 1, 1, 1, 1, 1, 0, 0, 1, 1,\n",
       "       1, 1, 1, 1, 1, 1, 1, 0, 0, 0, 1, 0, 1, 0, 0, 1, 1, 1, 0, 1, 1, 1,\n",
       "       1, 1, 0, 0, 0, 1, 1, 1, 1, 0, 1, 0, 1, 0, 0, 1, 0, 0, 1, 0, 0, 0,\n",
       "       1, 0, 0, 1, 1, 0, 0, 1, 1, 1, 1, 1, 1, 0, 0, 1, 0, 0, 0, 1, 1, 1,\n",
       "       0, 0, 0, 1, 1, 1, 1, 0, 1, 1, 0, 1, 0, 1, 0, 0, 1, 1, 1, 1, 0, 1,\n",
       "       1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 1, 1, 0, 0, 1, 0, 0, 0,\n",
       "       1, 1, 0, 1, 0, 1, 1, 1, 1, 0, 1, 1, 1, 0, 1, 0, 1, 1, 0, 1, 1, 1,\n",
       "       1, 0, 0, 0, 1, 1, 1, 1, 1, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,\n",
       "       1, 1, 1, 0, 1, 0, 1, 1, 0, 1, 0, 0, 1, 1, 1, 1, 1, 0, 1, 1, 0, 1,\n",
       "       0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 1, 1, 0, 1, 1, 1, 0, 1, 1, 0,\n",
       "       0, 1, 1, 0, 0, 1, 1, 0, 1, 1, 1, 1, 0, 1, 1, 0, 1, 0, 0, 0, 1, 1,\n",
       "       1, 1, 0, 1, 1, 1, 0, 0, 1, 0, 0, 1, 0, 0, 1, 1, 0, 1, 0, 1, 1, 1,\n",
       "       1, 1, 0, 1, 0, 0, 0, 0, 1, 0, 1, 0, 1, 0, 1, 0, 1, 0, 1, 1, 0, 0,\n",
       "       0, 0, 1, 0, 0, 1, 1, 0, 0, 1, 1, 0, 0, 0, 1, 1, 0, 1, 0, 1, 0, 0,\n",
       "       0, 0, 1, 0, 1, 0, 1, 0, 0, 1, 1, 0, 1, 1, 1, 0, 0, 1, 1, 0, 0, 1,\n",
       "       0, 1, 0, 0, 0, 1, 0, 1, 0, 0, 1, 0, 0, 1, 0, 1, 1, 1, 0, 0, 1, 0,\n",
       "       1, 0, 1, 0, 1, 0, 0, 0, 1, 1, 1, 0, 0, 1, 0, 1, 0, 0, 1, 1, 1, 1,\n",
       "       1, 0, 0, 1, 1, 1, 1, 1, 0, 0, 0, 0, 1, 0, 1, 1, 1, 0, 0, 1, 0, 0,\n",
       "       0, 1, 0, 0, 0, 0, 0, 1, 0, 1, 1, 0, 0, 0, 1, 1, 1, 1, 0, 1, 1, 0,\n",
       "       1, 1, 1, 0, 1, 0, 0, 0])"
      ]
     },
     "execution_count": 36,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "# Predicting with our model\n",
    "clf.predict(X_test)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 37,
   "metadata": {},
   "outputs": [],
   "source": [
    "# Sample Prediciton of A Text if it is a spam\n",
    "comment = [\"Check this out\"]\n",
    "vect = cv.transform(comment).toarray()"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 38,
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/plain": [
       "array([1])"
      ]
     },
     "execution_count": 38,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "class_dict = {'ham':0,'spam':1}\n",
    "clf.predict(vect)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 39,
   "metadata": {},
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "Spam\n"
     ]
    }
   ],
   "source": [
    "if clf.predict(vect) == 1:\n",
    "    print(\"Spam\")\n",
    "else:\n",
    "    print(\"Ham\")"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 40,
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/plain": [
       "array([0])"
      ]
     },
     "execution_count": 40,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "# Sample Prediciton 2\n",
    "comment1 = [\"Great song Friend\"]\n",
    "vect = cv.transform(comment1).toarray()\n",
    "clf.predict(vect)\n"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "### Save the Model "
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 41,
   "metadata": {},
   "outputs": [],
   "source": [
    "import pickle"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 42,
   "metadata": {},
   "outputs": [],
   "source": [
    "naivebayesML = open(\"Spam_model.pkl\",\"wb\")"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 43,
   "metadata": {},
   "outputs": [],
   "source": [
    "pickle.dump(clf,naivebayesML)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 44,
   "metadata": {},
   "outputs": [],
   "source": [
    "naivebayesML.close()"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "### Load the model"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 45,
   "metadata": {},
   "outputs": [],
   "source": [
    "ytb_model = open(\"Spam_model.pkl\",\"rb\")"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 46,
   "metadata": {},
   "outputs": [],
   "source": [
    "new_model = pickle.load(ytb_model)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 47,
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/plain": [
       "MultinomialNB(alpha=1.0, class_prior=None, fit_prior=True)"
      ]
     },
     "execution_count": 47,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "new_model"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 48,
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/plain": [
       "array([1])"
      ]
     },
     "execution_count": 48,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "# 3 Sample Prediciton \n",
    "comment2 = [\"Hey Music Fans I really appreciate all of you,but see this song too\"]\n",
    "vect = cv.transform(comment2).toarray()\n",
    "new_model.predict(vect)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 49,
   "metadata": {},
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "Spam\n"
     ]
    }
   ],
   "source": [
    "if new_model.predict(vect) == 1:\n",
    "    print(\"Spam\")\n",
    "else:\n",
    "    print(\"Ham\")"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 50,
   "metadata": {},
   "outputs": [],
   "source": [
    "## Thanks\n",
    "# Jesus Saves @ JCharisTech\n",
    "# By Jesse JCharis\n",
    "# J-Secur1ty"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": []
  }
 ],
 "metadata": {
  "kernelspec": {
   "display_name": "Python 3",
   "language": "python",
   "name": "python3"
  },
  "language_info": {
   "codemirror_mode": {
    "name": "ipython",
    "version": 3
   },
   "file_extension": ".py",
   "mimetype": "text/x-python",
   "name": "python",
   "nbconvert_exporter": "python",
   "pygments_lexer": "ipython3",
   "version": "3.5.2"
  }
 },
 "nbformat": 4,
 "nbformat_minor": 2
}
